annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/mummer-3.23/src/tigr/postnuc.cc @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 //------------------------------------------------------------------------------
jpayne@69 2 // Programmer: Adam M Phillippy, The Institute for Genomic Research
jpayne@69 3 // File: postnuc.cc
jpayne@69 4 // Date: 07 / 16 / 2002
jpayne@69 5 //
jpayne@69 6 // Revision: 08 / 01 / 2002
jpayne@69 7 // Added MUM extension functionality
jpayne@69 8 //
jpayne@69 9 // Purpose: To translate the coordinates referencing the concatenated
jpayne@69 10 // reference sequences back to the individual sequences and deal
jpayne@69 11 // with any conflict that may have arisen (i.e. a MUM that spans
jpayne@69 12 // the boundry between two sequences). Then to extend each cluster
jpayne@69 13 // via Smith-Waterman techniques to expand the alignment coverage.
jpayne@69 14 // Alignments which encounter each other will be fused to form one
jpayne@69 15 // encompasing alignment when appropriate.
jpayne@69 16 //
jpayne@69 17 // Input: Input is the output of the .mgaps program from stdin. On the
jpayne@69 18 // command line, the file names of the two original sequence files
jpayne@69 19 // should come first, followed by the prefix <pfx> that should be
jpayne@69 20 // placed in front of the two output filenames <pfx>.cluster and
jpayne@69 21 // <pfx>.delta
jpayne@69 22 //
jpayne@69 23 // NOTE: Cluster file is now suppressed by default (see -d option).
jpayne@69 24 //
jpayne@69 25 // Output: Output is to two output files, <pfx>.cluster and <pfx>.delta.
jpayne@69 26 // <pfx>.cluster lists MUM clusters as identified by "mgaps".
jpayne@69 27 // However, the clusters now reference their corresponding
jpayne@69 28 // sequence and are all listed under headers that specify this
jpayne@69 29 // sequence. The <pfx>.delta file is the alignment object that
jpayne@69 30 // contains all the information necessary to reproduce the alignments
jpayne@69 31 // generated by the MUM extension process. Please refer to the
jpayne@69 32 // output file documentation for an in-depth description of these
jpayne@69 33 // file formats.
jpayne@69 34 //
jpayne@69 35 // Usage: postnuc <reference> <query> <pfx> < <input>
jpayne@69 36 // Where <reference> and <query> are the original input sequences of
jpayne@69 37 // NUCmer and <pfx> is the prefix that should be added to the
jpayne@69 38 // beginning of the <pfx>.cluster and <pfx>.delta output filenames.
jpayne@69 39 //
jpayne@69 40 //------------------------------------------------------------------------------
jpayne@69 41
jpayne@69 42 //-- NOTE: this option will significantly hamper program performance,
jpayne@69 43 // mostly the alignment extension performance (sw_align.h)
jpayne@69 44 //#define _DEBUG_ASSERT // self testing assert functions
jpayne@69 45
jpayne@69 46 #include "tigrinc.hh"
jpayne@69 47 #include "sw_align.hh"
jpayne@69 48 #include <vector>
jpayne@69 49 #include <algorithm>
jpayne@69 50 using namespace std;
jpayne@69 51
jpayne@69 52
jpayne@69 53 //------------------------------------------------------ Globals -------------//
jpayne@69 54 const signed char FORWARD_CHAR = 1;
jpayne@69 55 const signed char REVERSE_CHAR = -1;
jpayne@69 56
jpayne@69 57
jpayne@69 58 bool DO_DELTA = true;
jpayne@69 59 bool DO_EXTEND = true;
jpayne@69 60 bool TO_SEQEND = false;
jpayne@69 61 bool DO_SHADOWS = false;
jpayne@69 62
jpayne@69 63
jpayne@69 64 //------------------------------------------------------ Type Definitions ----//
jpayne@69 65 enum LineType
jpayne@69 66 //-- The type of input line from <stdin>
jpayne@69 67 {
jpayne@69 68 NO_LINE, HEADER_LINE, MATCH_LINE
jpayne@69 69 };
jpayne@69 70
jpayne@69 71
jpayne@69 72 struct FastaRecord
jpayne@69 73 //-- The essential data of a sequence
jpayne@69 74 {
jpayne@69 75 char * Id; // the fasta ID header tag
jpayne@69 76 long int len; // the length of the sequence
jpayne@69 77 char * seq; // the sequence data
jpayne@69 78 };
jpayne@69 79
jpayne@69 80
jpayne@69 81 struct Match
jpayne@69 82 //-- An exact match between two sequences A and B
jpayne@69 83 {
jpayne@69 84 long int sA, sB, len; // start coordinate in A, in B and the length
jpayne@69 85 };
jpayne@69 86
jpayne@69 87
jpayne@69 88 struct Cluster
jpayne@69 89 //-- An ordered list of matches between two sequences A and B
jpayne@69 90 {
jpayne@69 91 bool wasFused; // have the cluster matches been fused yet?
jpayne@69 92 signed char dirB; // the query sequence direction
jpayne@69 93 // FORWARD_CHAR or REVERSE_CHAR
jpayne@69 94 vector<Match> matches; // the ordered set of matches in the cluster
jpayne@69 95 };
jpayne@69 96
jpayne@69 97
jpayne@69 98 struct Synteny
jpayne@69 99 //-- An ordered list of clusters between two sequences A and B
jpayne@69 100 {
jpayne@69 101 FastaRecord * AfP; // a pointer to the reference sequence record
jpayne@69 102 FastaRecord Bf; // the query sequence record (w/o the sequence)
jpayne@69 103 vector<Cluster> clusters; // the ordered set of clusters between A and B
jpayne@69 104 };
jpayne@69 105
jpayne@69 106
jpayne@69 107 struct Alignment
jpayne@69 108 //-- An alignment object between two sequences A and B
jpayne@69 109 {
jpayne@69 110 signed char dirB; // the query sequence direction
jpayne@69 111 long int sA, sB, eA, eB; // the start in A, B and the end in A, B
jpayne@69 112 vector<long int> delta; // the delta values, with NO zero at the end
jpayne@69 113 long int deltaApos; // sum of abs(deltas) - #of negative deltas
jpayne@69 114 // trust me, it is a very helpful value
jpayne@69 115 long int Errors, SimErrors, NonAlphas; // errors, similarity errors, nonalphas
jpayne@69 116 };
jpayne@69 117
jpayne@69 118
jpayne@69 119 struct AscendingClusterSort
jpayne@69 120 //-- For sorting clusters in ascending order of their sA coordinate
jpayne@69 121 {
jpayne@69 122 bool operator() (const Cluster & pA, const Cluster & pB)
jpayne@69 123 {
jpayne@69 124 return ( pA.matches.begin( )->sA < pB.matches.begin( )->sA );
jpayne@69 125 }
jpayne@69 126 };
jpayne@69 127
jpayne@69 128
jpayne@69 129
jpayne@69 130
jpayne@69 131 //------------------------------------------------- Function Declarations ----//
jpayne@69 132 void addNewAlignment
jpayne@69 133 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
jpayne@69 134 vector<Match>::iterator Mp);
jpayne@69 135
jpayne@69 136 bool extendBackward
jpayne@69 137 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
jpayne@69 138 vector<Alignment>::iterator TargetAp, const char * A, const char * B);
jpayne@69 139
jpayne@69 140 bool extendForward
jpayne@69 141 (vector<Alignment>::iterator Ap, const char * A, long int targetA,
jpayne@69 142 const char * B, long int targetB, unsigned int m_o);
jpayne@69 143
jpayne@69 144 void extendClusters
jpayne@69 145 (vector<Cluster> & Clusters,
jpayne@69 146 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile);
jpayne@69 147
jpayne@69 148 void flushAlignments
jpayne@69 149 (vector<Alignment> & Alignments,
jpayne@69 150 const FastaRecord * Af, const FastaRecord * Bf,
jpayne@69 151 FILE * DeltaFile);
jpayne@69 152
jpayne@69 153 void flushSyntenys
jpayne@69 154 (vector<Synteny> & Syntenys, FILE * ClusterFile);
jpayne@69 155
jpayne@69 156 vector<Cluster>::iterator getForwardTargetCluster
jpayne@69 157 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
jpayne@69 158 long int & targetA, long int & targetB);
jpayne@69 159
jpayne@69 160 vector<Alignment>::iterator getReverseTargetAlignment
jpayne@69 161 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp);
jpayne@69 162
jpayne@69 163 bool isShadowedCluster
jpayne@69 164 (vector<Cluster>::iterator CurrCp,
jpayne@69 165 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap);
jpayne@69 166
jpayne@69 167 void parseAbort
jpayne@69 168 (const char *);
jpayne@69 169
jpayne@69 170 void parseDelta
jpayne@69 171 (vector<Alignment> & Alignments,
jpayne@69 172 const FastaRecord * Af, const FastaRecord *Bf);
jpayne@69 173
jpayne@69 174 void processSyntenys
jpayne@69 175 (vector<Synteny> & Syntenys,
jpayne@69 176 FastaRecord * Af, long int As,
jpayne@69 177 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile);
jpayne@69 178
jpayne@69 179 inline long int revC
jpayne@69 180 (long int Coord, long int Len);
jpayne@69 181
jpayne@69 182 void printHelp
jpayne@69 183 (const char *);
jpayne@69 184
jpayne@69 185 void printUsage
jpayne@69 186 (const char *);
jpayne@69 187
jpayne@69 188 void validateData
jpayne@69 189 (vector<Alignment> Alignments, vector<Cluster> Clusters,
jpayne@69 190 const FastaRecord * Af, const FastaRecord * Bf);
jpayne@69 191
jpayne@69 192
jpayne@69 193
jpayne@69 194
jpayne@69 195 //-------------------------------------------------- Function Definitions ----//
jpayne@69 196 int main
jpayne@69 197 (int argc, char ** argv)
jpayne@69 198 {
jpayne@69 199 FastaRecord * Af; // array of all the reference sequences
jpayne@69 200
jpayne@69 201 vector<Synteny> Syntenys; // vector of all sets of clusters
jpayne@69 202 vector<Synteny>::reverse_iterator CurrSp; // current set of clusters
jpayne@69 203 vector<Synteny>::reverse_iterator Sp; // temporary synteny pointer
jpayne@69 204
jpayne@69 205 Synteny Asyn; // a single set of clusters
jpayne@69 206 Cluster Aclu; // a single cluster of matches
jpayne@69 207 Match Amat; // a single match
jpayne@69 208
jpayne@69 209 LineType PrevLine; // the current input line
jpayne@69 210
jpayne@69 211 bool Found; // temporary search flag
jpayne@69 212
jpayne@69 213 char Line [MAX_LINE]; // a single line of input
jpayne@69 214 char CurrIdB [MAX_LINE], IdA [MAX_LINE], IdB [MAX_LINE]; // fasta ID headers
jpayne@69 215 char ClusterFileName [MAX_LINE], DeltaFileName [MAX_LINE]; // file names
jpayne@69 216 char RefFileName [MAX_LINE], QryFileName [MAX_LINE];
jpayne@69 217
jpayne@69 218 signed char DirB = FORWARD_CHAR; // the current query strand direction
jpayne@69 219
jpayne@69 220 long int i; // temporary counter
jpayne@69 221 long int Seqi; // current reference sequence index
jpayne@69 222 long int InitSize; // initial sequence memory space
jpayne@69 223 long int As = 0; // the size of the reference array
jpayne@69 224 long int Ac = 100; // the capacity of the reference array
jpayne@69 225 long int sA, sB, len; // current match start in A, B and length
jpayne@69 226
jpayne@69 227 FILE * RefFile, * QryFile; // reference and query input files
jpayne@69 228 FILE * ClusterFile, * DeltaFile; // cluster and delta output files
jpayne@69 229
jpayne@69 230 //-- Set the alignment data type and break length (sw_align.h)
jpayne@69 231 setMatrixType ( NUCLEOTIDE );
jpayne@69 232 setBreakLen ( 200 );
jpayne@69 233 setBanding ( 0 );
jpayne@69 234
jpayne@69 235 //-- Parse the command line arguments
jpayne@69 236 {
jpayne@69 237 optarg = NULL;
jpayne@69 238 int ch, errflg = 0;
jpayne@69 239 while ( !errflg && ((ch = getopt (argc, argv, "dehB:b:st")) != EOF) )
jpayne@69 240 switch (ch)
jpayne@69 241 {
jpayne@69 242 case 'b' :
jpayne@69 243 setBreakLen( atoi (optarg) );
jpayne@69 244 break;
jpayne@69 245
jpayne@69 246 case 'B' :
jpayne@69 247 setBanding( atoi (optarg) );
jpayne@69 248 break;
jpayne@69 249
jpayne@69 250 case 'd' :
jpayne@69 251 DO_DELTA = false;
jpayne@69 252 break;
jpayne@69 253
jpayne@69 254 case 'e' :
jpayne@69 255 DO_EXTEND = false;
jpayne@69 256 break;
jpayne@69 257
jpayne@69 258 case 'h' :
jpayne@69 259 printHelp (argv[0]);
jpayne@69 260 exit (EXIT_SUCCESS);
jpayne@69 261 break;
jpayne@69 262
jpayne@69 263 case 's' :
jpayne@69 264 DO_SHADOWS = true;
jpayne@69 265 break;
jpayne@69 266
jpayne@69 267 case 't' :
jpayne@69 268 TO_SEQEND = true;
jpayne@69 269 break;
jpayne@69 270
jpayne@69 271 default :
jpayne@69 272 errflg ++;
jpayne@69 273 }
jpayne@69 274
jpayne@69 275 if ( errflg > 0 || argc - optind != 3 )
jpayne@69 276 {
jpayne@69 277 printUsage (argv[0]);
jpayne@69 278 exit (EXIT_FAILURE);
jpayne@69 279 }
jpayne@69 280 }
jpayne@69 281
jpayne@69 282 //-- Read and create the I/O file names
jpayne@69 283 strcpy (RefFileName, argv[optind ++]);
jpayne@69 284 strcpy (QryFileName, argv[optind ++]);
jpayne@69 285 strcpy (ClusterFileName, argv[optind ++]);
jpayne@69 286 strcpy (DeltaFileName, ClusterFileName);
jpayne@69 287 strcat (ClusterFileName, ".cluster");
jpayne@69 288 strcat (DeltaFileName, ".delta");
jpayne@69 289
jpayne@69 290 //-- Open all the files
jpayne@69 291 RefFile = File_Open (RefFileName, "r");
jpayne@69 292 QryFile = File_Open (QryFileName, "r");
jpayne@69 293 if ( DO_DELTA )
jpayne@69 294 {
jpayne@69 295 ClusterFile = NULL;
jpayne@69 296 DeltaFile = File_Open (DeltaFileName, "w");
jpayne@69 297 }
jpayne@69 298 else
jpayne@69 299 {
jpayne@69 300 ClusterFile = File_Open (ClusterFileName, "w");
jpayne@69 301 DeltaFile = NULL;
jpayne@69 302 }
jpayne@69 303
jpayne@69 304 //-- Print the headers of the output files
jpayne@69 305 if ( DO_DELTA )
jpayne@69 306 fprintf (DeltaFile, "%s %s\nNUCMER\n", RefFileName, QryFileName);
jpayne@69 307 else
jpayne@69 308 fprintf (ClusterFile, "%s %s\nNUCMER\n", RefFileName, QryFileName);
jpayne@69 309
jpayne@69 310 //-- Generate the array of the reference sequences
jpayne@69 311 InitSize = INIT_SIZE;
jpayne@69 312 Af = (FastaRecord *) Safe_malloc ( sizeof(FastaRecord) * Ac );
jpayne@69 313 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
jpayne@69 314 while ( Read_String (RefFile, Af[As].seq, InitSize, IdA, FALSE) )
jpayne@69 315 {
jpayne@69 316 Af[As].Id = (char *) Safe_malloc (sizeof(char) * (strlen(IdA) + 1));
jpayne@69 317 strcpy (Af[As].Id, IdA);
jpayne@69 318
jpayne@69 319 Af[As].len = strlen (Af[As].seq + 1);
jpayne@69 320
jpayne@69 321 if ( ++ As >= Ac )
jpayne@69 322 {
jpayne@69 323 Ac *= 2;
jpayne@69 324 Af = (FastaRecord *) Safe_realloc ( Af, sizeof(FastaRecord) * Ac );
jpayne@69 325 }
jpayne@69 326
jpayne@69 327 InitSize = INIT_SIZE;
jpayne@69 328 Af[As].seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
jpayne@69 329 }
jpayne@69 330 fclose (RefFile);
jpayne@69 331 if ( As <= 0 )
jpayne@69 332 parseAbort ( RefFileName );
jpayne@69 333
jpayne@69 334
jpayne@69 335
jpayne@69 336 //-- Process the input from <stdin> line by line
jpayne@69 337 PrevLine = NO_LINE;
jpayne@69 338 IdA[0] = '\0';
jpayne@69 339 IdB[0] = '\0';
jpayne@69 340 CurrIdB[0] = '\0';
jpayne@69 341 while ( fgets(Line, MAX_LINE, stdin) != NULL )
jpayne@69 342 {
jpayne@69 343
jpayne@69 344 //-- If the current line is a fasta HEADER_LINE
jpayne@69 345 if ( Line[0] == '>' )
jpayne@69 346 {
jpayne@69 347 if ( sscanf (Line + 1, "%s", CurrIdB) != 1 )
jpayne@69 348 parseAbort ("stdin");
jpayne@69 349 DirB = strstr (Line," Reverse") == NULL ? FORWARD_CHAR : REVERSE_CHAR;
jpayne@69 350
jpayne@69 351 PrevLine = HEADER_LINE;
jpayne@69 352 }
jpayne@69 353
jpayne@69 354
jpayne@69 355 //-- If the current line is a cluster HEADER_LINE
jpayne@69 356 else if ( Line[0] == '#' )
jpayne@69 357 {
jpayne@69 358 PrevLine = HEADER_LINE;
jpayne@69 359 }
jpayne@69 360
jpayne@69 361
jpayne@69 362 //-- If the current line is a MATCH_LINE
jpayne@69 363 else
jpayne@69 364 {
jpayne@69 365 if ( sscanf (Line, "%ld %ld %ld", &sA, &sB, &len) != 3 )
jpayne@69 366 parseAbort ("stdin");
jpayne@69 367
jpayne@69 368 //-- Re-map the reference coordinate back to its original sequence
jpayne@69 369 for ( Seqi = 0; sA > Af[Seqi].len; Seqi ++ )
jpayne@69 370 sA -= Af[Seqi].len + 1; // extra +1 for the x at the end of each seq
jpayne@69 371 if ( Seqi >= As )
jpayne@69 372 {
jpayne@69 373 fprintf (stderr,
jpayne@69 374 "ERROR: A MUM was found with a start coordinate greater than\n"
jpayne@69 375 " the sequence length, a serious error has occured.\n"
jpayne@69 376 " Please file a bug report\n");
jpayne@69 377 exit (EXIT_FAILURE);
jpayne@69 378 }
jpayne@69 379
jpayne@69 380 //-- If the match spans across a sequence boundry
jpayne@69 381 if ( sA + len - 1 > Af[Seqi].len || sA <= 0)
jpayne@69 382 {
jpayne@69 383 fprintf (stderr,
jpayne@69 384 "WARNING: A MUM was found extending beyond the boundry of:\n"
jpayne@69 385 " Reference sequence '>%s'\n\n"
jpayne@69 386 "Please check that the '-n' option is activated on 'mummer2'\n"
jpayne@69 387 "and try again, or file a bug report\n"
jpayne@69 388 "Attempting to continue.\n", Af[Seqi].Id);
jpayne@69 389 continue;
jpayne@69 390 }
jpayne@69 391
jpayne@69 392 //-- Check and update the current synteny region
jpayne@69 393 if ( strcmp (IdA, Af[Seqi].Id) != 0 || strcmp (IdB, CurrIdB) != 0 )
jpayne@69 394 {
jpayne@69 395 Found = false;
jpayne@69 396 if ( strcmp (IdB, CurrIdB) == 0 )
jpayne@69 397 {
jpayne@69 398 //-- Has this header been seen before?
jpayne@69 399 for ( Sp = Syntenys.rbegin( ); Sp < Syntenys.rend( ); Sp ++ )
jpayne@69 400 {
jpayne@69 401 if ( strcmp (Sp->AfP->Id, Af[Seqi].Id) == 0 )
jpayne@69 402 {
jpayne@69 403 if ( Sp->AfP->len != Af[Seqi].len )
jpayne@69 404 {
jpayne@69 405 fprintf (stderr,
jpayne@69 406 "ERROR: The reference file may contain"
jpayne@69 407 " sequences with non-unique\n"
jpayne@69 408 " header Ids, please check your input"
jpayne@69 409 " files and try again\n");
jpayne@69 410 exit (EXIT_FAILURE);
jpayne@69 411 }
jpayne@69 412 assert ( strcmp (Sp->Bf.Id, IdB) == 0 );
jpayne@69 413 CurrSp = Sp;
jpayne@69 414 Found = true;
jpayne@69 415 break;
jpayne@69 416 }
jpayne@69 417 }
jpayne@69 418 }
jpayne@69 419 else
jpayne@69 420 {
jpayne@69 421 //-- New B sequence header, process all the old synteny's
jpayne@69 422 processSyntenys (Syntenys, Af, As,
jpayne@69 423 QryFile, ClusterFile, DeltaFile);
jpayne@69 424 }
jpayne@69 425
jpayne@69 426 strcpy (IdA, Af[Seqi].Id);
jpayne@69 427 strcpy (IdB, CurrIdB);
jpayne@69 428
jpayne@69 429 //-- If not seen yet, create a new synteny region
jpayne@69 430 if ( ! Found )
jpayne@69 431 {
jpayne@69 432 Asyn.AfP = Af + Seqi;
jpayne@69 433 Asyn.Bf.len = -1;
jpayne@69 434 Asyn.Bf.Id = (char *) Safe_malloc
jpayne@69 435 (sizeof(char) * (strlen(IdB) + 1));
jpayne@69 436 strcpy (Asyn.Bf.Id, IdB);
jpayne@69 437
jpayne@69 438 Syntenys.push_back (Asyn);
jpayne@69 439 CurrSp = Syntenys.rbegin( );
jpayne@69 440 }
jpayne@69 441
jpayne@69 442 //-- Add a new cluster to the current synteny
jpayne@69 443 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
jpayne@69 444 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
jpayne@69 445 CurrSp->clusters.pop_back( ); // hack to remove empties
jpayne@69 446 Aclu.wasFused = false;
jpayne@69 447 Aclu.dirB = DirB;
jpayne@69 448 CurrSp->clusters.push_back (Aclu);
jpayne@69 449 }
jpayne@69 450 else if ( PrevLine == HEADER_LINE )
jpayne@69 451 {
jpayne@69 452 //-- Add a new cluster to the current synteny
jpayne@69 453 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
jpayne@69 454 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
jpayne@69 455 CurrSp->clusters.pop_back( );
jpayne@69 456 Aclu.wasFused = false;
jpayne@69 457 Aclu.dirB = DirB;
jpayne@69 458 CurrSp->clusters.push_back (Aclu);
jpayne@69 459 }
jpayne@69 460
jpayne@69 461 //-- Add a new match to the current cluster
jpayne@69 462 if ( len > 1 )
jpayne@69 463 {
jpayne@69 464 Amat.sA = sA;
jpayne@69 465 Amat.sB = sB;
jpayne@69 466 Amat.len = len;
jpayne@69 467 CurrSp->clusters.rbegin( )->matches.push_back (Amat);
jpayne@69 468 }
jpayne@69 469
jpayne@69 470 PrevLine = MATCH_LINE;
jpayne@69 471 }
jpayne@69 472 }
jpayne@69 473
jpayne@69 474 //-- Process the left-over syntenys
jpayne@69 475 if ( !Syntenys.empty( ) && !CurrSp->clusters.empty( ) )
jpayne@69 476 if ( CurrSp->clusters.rbegin( )->matches.empty( ) )
jpayne@69 477 CurrSp->clusters.pop_back( );
jpayne@69 478
jpayne@69 479 processSyntenys (Syntenys, Af, As, QryFile, ClusterFile, DeltaFile);
jpayne@69 480 fclose (QryFile);
jpayne@69 481
jpayne@69 482 //-- Free the reference sequences
jpayne@69 483 for ( i = 0; i < As; i ++ )
jpayne@69 484 {
jpayne@69 485 free (Af[i].Id);
jpayne@69 486 free (Af[i].seq);
jpayne@69 487 }
jpayne@69 488 free (Af);
jpayne@69 489
jpayne@69 490 return EXIT_SUCCESS;
jpayne@69 491 }
jpayne@69 492
jpayne@69 493
jpayne@69 494
jpayne@69 495
jpayne@69 496 void addNewAlignment
jpayne@69 497 (vector<Alignment> & Alignments, vector<Cluster>::iterator Cp,
jpayne@69 498 vector<Match>::iterator Mp)
jpayne@69 499
jpayne@69 500 // Create and add a new alignment object based on the current match
jpayne@69 501 // and cluster information pointed to by Cp and Mp.
jpayne@69 502
jpayne@69 503 {
jpayne@69 504 Alignment Align;
jpayne@69 505
jpayne@69 506 //-- Initialize the data
jpayne@69 507 Align.sA = Mp->sA;
jpayne@69 508 Align.sB = Mp->sB;
jpayne@69 509 Align.eA = Mp->sA + Mp->len - 1;
jpayne@69 510 Align.eB = Mp->sB + Mp->len - 1;
jpayne@69 511 Align.dirB = Cp->dirB;
jpayne@69 512 Align.delta.clear( );
jpayne@69 513 Align.deltaApos = 0;
jpayne@69 514
jpayne@69 515 //-- Add the alignment object
jpayne@69 516 Alignments.push_back (Align);
jpayne@69 517
jpayne@69 518 return;
jpayne@69 519 }
jpayne@69 520
jpayne@69 521
jpayne@69 522
jpayne@69 523
jpayne@69 524 bool extendBackward
jpayne@69 525 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp,
jpayne@69 526 vector<Alignment>::iterator TargetAp, const char * A, const char * B)
jpayne@69 527
jpayne@69 528 // Extend an alignment backwards off of the current alignment object.
jpayne@69 529 // The current alignment object must be freshly created and consist
jpayne@69 530 // only of an exact match (i.e. the delta vector MUST be empty).
jpayne@69 531 // If the TargetAp alignment object is reached by the extension, it will
jpayne@69 532 // be merged with CurrAp and CurrAp will be destroyed. If TargetAp is
jpayne@69 533 // NULL the function will extend as far as possible. It is a strange
jpayne@69 534 // and dangerous function because it can delete CurrAp, so edit with
jpayne@69 535 // caution. Returns true if TargetAp was reached and merged, else false
jpayne@69 536 // Designed only as a subroutine for extendClusters, should be used
jpayne@69 537 // nowhere else.
jpayne@69 538
jpayne@69 539 {
jpayne@69 540 bool target_reached = false;
jpayne@69 541 bool overflow_flag = false;
jpayne@69 542 bool double_flag = false;
jpayne@69 543
jpayne@69 544 vector<long int>::iterator Dp;
jpayne@69 545
jpayne@69 546 unsigned int m_o;
jpayne@69 547 long int targetA, targetB;
jpayne@69 548
jpayne@69 549 m_o = BACKWARD_SEARCH;
jpayne@69 550
jpayne@69 551 //-- Set the target coordinates
jpayne@69 552 if ( TargetAp != Alignments.end( ) )
jpayne@69 553 {
jpayne@69 554 targetA = TargetAp->eA;
jpayne@69 555 targetB = TargetAp->eB;
jpayne@69 556 }
jpayne@69 557 else
jpayne@69 558 {
jpayne@69 559 targetA = 1;
jpayne@69 560 targetB = 1;
jpayne@69 561 m_o |= OPTIMAL_BIT;
jpayne@69 562 }
jpayne@69 563
jpayne@69 564 //-- If alignment is too long, bring with bounds and set overflow_flag true
jpayne@69 565 if ( CurrAp->sA - targetA + 1 > MAX_ALIGNMENT_LENGTH )
jpayne@69 566 {
jpayne@69 567 targetA = CurrAp->sA - MAX_ALIGNMENT_LENGTH + 1;
jpayne@69 568 overflow_flag = true;
jpayne@69 569 m_o |= OPTIMAL_BIT;
jpayne@69 570 }
jpayne@69 571 if ( CurrAp->sB - targetB + 1 > MAX_ALIGNMENT_LENGTH )
jpayne@69 572 {
jpayne@69 573 targetB = CurrAp->sB - MAX_ALIGNMENT_LENGTH + 1;
jpayne@69 574 if ( overflow_flag )
jpayne@69 575 double_flag = true;
jpayne@69 576 else
jpayne@69 577 overflow_flag = true;
jpayne@69 578 m_o |= OPTIMAL_BIT;
jpayne@69 579 }
jpayne@69 580
jpayne@69 581
jpayne@69 582 if ( TO_SEQEND && !double_flag )
jpayne@69 583 m_o |= SEQEND_BIT;
jpayne@69 584
jpayne@69 585
jpayne@69 586 //-- Attempt to reach the target
jpayne@69 587 target_reached = alignSearch (A, CurrAp->sA, targetA,
jpayne@69 588 B, CurrAp->sB, targetB, m_o);
jpayne@69 589
jpayne@69 590 if ( overflow_flag || TargetAp == Alignments.end( ) )
jpayne@69 591 target_reached = false;
jpayne@69 592
jpayne@69 593 if ( target_reached )
jpayne@69 594 {
jpayne@69 595 //-- Merge the two alignment objects
jpayne@69 596 extendForward (TargetAp, A, CurrAp->sA,
jpayne@69 597 B, CurrAp->sB, FORCED_FORWARD_ALIGN);
jpayne@69 598 TargetAp->eA = CurrAp->eA;
jpayne@69 599 TargetAp->eB = CurrAp->eB;
jpayne@69 600 Alignments.pop_back( );
jpayne@69 601 }
jpayne@69 602 else
jpayne@69 603 {
jpayne@69 604 alignTarget (A, targetA, CurrAp->sA,
jpayne@69 605 B, targetB, CurrAp->sB,
jpayne@69 606 CurrAp->delta, FORCED_FORWARD_ALIGN);
jpayne@69 607 CurrAp->sA = targetA;
jpayne@69 608 CurrAp->sB = targetB;
jpayne@69 609
jpayne@69 610 //-- Update the deltaApos value for the alignment object
jpayne@69 611 for ( Dp = CurrAp->delta.begin( ); Dp < CurrAp->delta.end( ); Dp ++ )
jpayne@69 612 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
jpayne@69 613 }
jpayne@69 614
jpayne@69 615 return target_reached;
jpayne@69 616 }
jpayne@69 617
jpayne@69 618
jpayne@69 619
jpayne@69 620
jpayne@69 621 bool extendForward
jpayne@69 622 (vector<Alignment>::iterator CurrAp, const char * A, long int targetA,
jpayne@69 623 const char * B, long int targetB, unsigned int m_o)
jpayne@69 624
jpayne@69 625 // Extend an alignment forwards off the current alignment object until
jpayne@69 626 // target or end of sequence is reached, and merge the delta values of the
jpayne@69 627 // alignment object with the new delta values generated by the extension.
jpayne@69 628 // Return true if the target was reached, else false
jpayne@69 629
jpayne@69 630 {
jpayne@69 631 long int ValA;
jpayne@69 632 long int ValB;
jpayne@69 633 unsigned int Di;
jpayne@69 634 bool target_reached;
jpayne@69 635 bool overflow_flag = false;
jpayne@69 636 bool double_flag = false;
jpayne@69 637 vector<long int>::iterator Dp;
jpayne@69 638
jpayne@69 639 //-- Set Di to the end of the delta vector
jpayne@69 640 Di = CurrAp->delta.size( );
jpayne@69 641
jpayne@69 642 //-- Calculate the distance between the start and end positions
jpayne@69 643 ValA = targetA - CurrAp->eA + 1;
jpayne@69 644 ValB = targetB - CurrAp->eB + 1;
jpayne@69 645
jpayne@69 646 //-- If the distance is too long, shrink it and set the overflow_flag
jpayne@69 647 if ( ValA > MAX_ALIGNMENT_LENGTH )
jpayne@69 648 {
jpayne@69 649 targetA = CurrAp->eA + MAX_ALIGNMENT_LENGTH - 1;
jpayne@69 650 overflow_flag = true;
jpayne@69 651 m_o |= OPTIMAL_BIT;
jpayne@69 652 }
jpayne@69 653 if ( ValB > MAX_ALIGNMENT_LENGTH )
jpayne@69 654 {
jpayne@69 655 targetB = CurrAp->eB + MAX_ALIGNMENT_LENGTH - 1;
jpayne@69 656 if ( overflow_flag )
jpayne@69 657 double_flag = true;
jpayne@69 658 else
jpayne@69 659 overflow_flag = true;
jpayne@69 660 m_o |= OPTIMAL_BIT;
jpayne@69 661 }
jpayne@69 662
jpayne@69 663 if ( double_flag )
jpayne@69 664 m_o &= ~SEQEND_BIT;
jpayne@69 665
jpayne@69 666 target_reached = alignTarget (A, CurrAp->eA, targetA,
jpayne@69 667 B, CurrAp->eB, targetB,
jpayne@69 668 CurrAp->delta, m_o);
jpayne@69 669
jpayne@69 670 //-- Notify user if alignment was chopped short
jpayne@69 671 if ( target_reached && overflow_flag )
jpayne@69 672 target_reached = false;
jpayne@69 673
jpayne@69 674 //-- Pick up delta where left off (Di) and merge with new deltas
jpayne@69 675 if ( Di < CurrAp->delta.size( ) )
jpayne@69 676 {
jpayne@69 677 //-- Merge the deltas
jpayne@69 678 ValA = (CurrAp->eA - CurrAp->sA + 1) - CurrAp->deltaApos - 1;
jpayne@69 679 CurrAp->delta[Di] += CurrAp->delta[Di] > 0 ? ValA : -(ValA);
jpayne@69 680 if ( CurrAp->delta[Di] == 0 || ValA < 0 )
jpayne@69 681 {
jpayne@69 682 fprintf(stderr,
jpayne@69 683 "ERROR: failed to merge alignments at position %ld\n"
jpayne@69 684 " Please file a bug report\n",
jpayne@69 685 CurrAp->eA);
jpayne@69 686 exit (EXIT_FAILURE);
jpayne@69 687 }
jpayne@69 688
jpayne@69 689 //-- Update the deltaApos
jpayne@69 690 for ( Dp = CurrAp->delta.begin( ) + Di; Dp < CurrAp->delta.end( ); Dp ++ )
jpayne@69 691 CurrAp->deltaApos += *Dp > 0 ? *Dp : labs(*Dp) - 1;
jpayne@69 692 }
jpayne@69 693
jpayne@69 694 //-- Set the alignment coordinates
jpayne@69 695 CurrAp->eA = targetA;
jpayne@69 696 CurrAp->eB = targetB;
jpayne@69 697
jpayne@69 698 return target_reached;
jpayne@69 699 }
jpayne@69 700
jpayne@69 701
jpayne@69 702
jpayne@69 703
jpayne@69 704 void extendClusters
jpayne@69 705 (vector<Cluster> & Clusters,
jpayne@69 706 const FastaRecord * Af, const FastaRecord * Bf, FILE * DeltaFile)
jpayne@69 707
jpayne@69 708 // Connect all the matches in every cluster between sequences A and B.
jpayne@69 709 // Also, extend alignments off of the front and back of each cluster to
jpayne@69 710 // expand total alignment coverage. When these extensions encounter an
jpayne@69 711 // adjacent cluster, fuse the two regions to create one single
jpayne@69 712 // encompassing region. This routine will create alignment objects from
jpayne@69 713 // these extensions and output the resulting delta information to the
jpayne@69 714 // delta output file.
jpayne@69 715
jpayne@69 716 {
jpayne@69 717 //-- Sort the clusters (ascending) by their start coordinate in sequence A
jpayne@69 718 sort (Clusters.begin( ), Clusters.end( ), AscendingClusterSort( ));
jpayne@69 719
jpayne@69 720
jpayne@69 721 //-- If no delta file is requested
jpayne@69 722 if ( ! DO_DELTA )
jpayne@69 723 return;
jpayne@69 724
jpayne@69 725
jpayne@69 726 bool target_reached = false; // reached the adjacent match or cluster
jpayne@69 727
jpayne@69 728 char * A, * B; // the sequences A and B
jpayne@69 729 char * Brev = NULL; // the reverse complement of B
jpayne@69 730
jpayne@69 731 unsigned int m_o;
jpayne@69 732 long int targetA, targetB; // alignment extension targets in A and B
jpayne@69 733
jpayne@69 734 vector<Match>::iterator Mp; // match pointer
jpayne@69 735
jpayne@69 736 vector<Cluster>::iterator PrevCp; // where the extensions last left off
jpayne@69 737 vector<Cluster>::iterator CurrCp; // the current cluster being extended
jpayne@69 738 vector<Cluster>::iterator TargetCp = Clusters.end( ); // the target cluster
jpayne@69 739
jpayne@69 740 vector<Alignment> Alignments; // the vector of alignment objects
jpayne@69 741 vector<Alignment>::iterator CurrAp = Alignments.begin( ); // current align
jpayne@69 742 vector<Alignment>::iterator TargetAp; // target align
jpayne@69 743
jpayne@69 744
jpayne@69 745 //-- Extend each cluster
jpayne@69 746 A = Af->seq;
jpayne@69 747 PrevCp = Clusters.begin( );
jpayne@69 748 CurrCp = Clusters.begin( );
jpayne@69 749 while ( CurrCp < Clusters.end( ) )
jpayne@69 750 {
jpayne@69 751 if ( DO_EXTEND )
jpayne@69 752 {
jpayne@69 753 //-- Ignore if shadowed or already extended
jpayne@69 754 if ( ! target_reached )
jpayne@69 755 if ( CurrCp->wasFused
jpayne@69 756 ||
jpayne@69 757 (!DO_SHADOWS &&
jpayne@69 758 isShadowedCluster (CurrCp, Alignments, CurrAp)) )
jpayne@69 759 {
jpayne@69 760 CurrCp->wasFused = true;
jpayne@69 761 CurrCp = ++ PrevCp;
jpayne@69 762 continue;
jpayne@69 763 }
jpayne@69 764 }
jpayne@69 765
jpayne@69 766 //-- Pick the right directional sequence for B
jpayne@69 767 if ( CurrCp->dirB == FORWARD_CHAR )
jpayne@69 768 B = Bf->seq;
jpayne@69 769 else if ( Brev != NULL )
jpayne@69 770 B = Brev;
jpayne@69 771 else
jpayne@69 772 {
jpayne@69 773 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
jpayne@69 774 strcpy ( Brev + 1, Bf->seq + 1 );
jpayne@69 775 Brev[0] = '\0';
jpayne@69 776 Reverse_Complement (Brev, 1, Bf->len);
jpayne@69 777 B = Brev;
jpayne@69 778 }
jpayne@69 779
jpayne@69 780 //-- Extend each match in the cluster
jpayne@69 781 for ( Mp = CurrCp->matches.begin( ); Mp < CurrCp->matches.end( ); Mp ++ )
jpayne@69 782 {
jpayne@69 783 //-- Try to extend the current match backwards
jpayne@69 784 if ( target_reached )
jpayne@69 785 {
jpayne@69 786 //-- Merge with the previous match
jpayne@69 787 if ( CurrAp->eA != Mp->sA || CurrAp->eB != Mp->sB )
jpayne@69 788 {
jpayne@69 789 if ( Mp >= CurrCp->matches.end( ) - 1 )
jpayne@69 790 {
jpayne@69 791 fprintf (stderr,
jpayne@69 792 "ERROR: Target match does not exist, please\n"
jpayne@69 793 " file a bug report\n");
jpayne@69 794 exit (EXIT_FAILURE);
jpayne@69 795 }
jpayne@69 796 continue;
jpayne@69 797 }
jpayne@69 798 CurrAp->eA += Mp->len - 1;
jpayne@69 799 CurrAp->eB += Mp->len - 1;
jpayne@69 800 }
jpayne@69 801 else
jpayne@69 802 {
jpayne@69 803 //-- Create a new alignment object
jpayne@69 804 addNewAlignment (Alignments, CurrCp, Mp);
jpayne@69 805 CurrAp = Alignments.end( ) - 1;
jpayne@69 806
jpayne@69 807 if ( DO_EXTEND || Mp != CurrCp->matches.begin ( ) )
jpayne@69 808 {
jpayne@69 809 //-- Target the closest/best alignment object
jpayne@69 810 TargetAp = getReverseTargetAlignment (Alignments, CurrAp);
jpayne@69 811
jpayne@69 812 //-- Extend the new alignment object backwards
jpayne@69 813 if ( extendBackward (Alignments, CurrAp, TargetAp, A, B) )
jpayne@69 814 CurrAp = TargetAp;
jpayne@69 815 }
jpayne@69 816 }
jpayne@69 817
jpayne@69 818 m_o = FORWARD_ALIGN;
jpayne@69 819
jpayne@69 820 //-- Try to extend the current match forwards
jpayne@69 821 if ( Mp < CurrCp->matches.end( ) - 1 )
jpayne@69 822 {
jpayne@69 823 //-- Target the next match in the cluster
jpayne@69 824 targetA = (Mp + 1)->sA;
jpayne@69 825 targetB = (Mp + 1)->sB;
jpayne@69 826
jpayne@69 827 //-- Extend the current alignment object forward
jpayne@69 828 target_reached = extendForward (CurrAp, A, targetA,
jpayne@69 829 B, targetB, m_o);
jpayne@69 830 }
jpayne@69 831 else if ( DO_EXTEND )
jpayne@69 832 {
jpayne@69 833 targetA = Af->len;
jpayne@69 834 targetB = Bf->len;
jpayne@69 835
jpayne@69 836 //-- Target the closest/best match in a future cluster
jpayne@69 837 TargetCp = getForwardTargetCluster (Clusters, CurrCp,
jpayne@69 838 targetA, targetB);
jpayne@69 839 if ( TargetCp == Clusters.end( ) )
jpayne@69 840 {
jpayne@69 841 m_o |= OPTIMAL_BIT;
jpayne@69 842 if ( TO_SEQEND )
jpayne@69 843 m_o |= SEQEND_BIT;
jpayne@69 844 }
jpayne@69 845
jpayne@69 846 //-- Extend the current alignment object forward
jpayne@69 847 target_reached = extendForward (CurrAp, A, targetA,
jpayne@69 848 B, targetB, m_o);
jpayne@69 849 }
jpayne@69 850 }
jpayne@69 851 if ( TargetCp == Clusters.end( ) )
jpayne@69 852 target_reached = false;
jpayne@69 853
jpayne@69 854 CurrCp->wasFused = true;
jpayne@69 855
jpayne@69 856 if ( target_reached == false )
jpayne@69 857 CurrCp = ++ PrevCp;
jpayne@69 858 else
jpayne@69 859 CurrCp = TargetCp;
jpayne@69 860 }
jpayne@69 861
jpayne@69 862 #ifdef _DEBUG_ASSERT
jpayne@69 863 validateData (Alignments, Clusters, Af, Bf);
jpayne@69 864 #endif
jpayne@69 865
jpayne@69 866 //-- Output the alignment data to the delta file
jpayne@69 867 flushAlignments (Alignments, Af, Bf, DeltaFile);
jpayne@69 868
jpayne@69 869 if ( Brev != NULL )
jpayne@69 870 free (Brev);
jpayne@69 871
jpayne@69 872 return;
jpayne@69 873 }
jpayne@69 874
jpayne@69 875
jpayne@69 876
jpayne@69 877
jpayne@69 878 void flushAlignments
jpayne@69 879 (vector<Alignment> & Alignments,
jpayne@69 880 const FastaRecord * Af, const FastaRecord * Bf,
jpayne@69 881 FILE * DeltaFile)
jpayne@69 882
jpayne@69 883 // Simply output the delta information stored in Alignments to the
jpayne@69 884 // given delta file. Free the memory used by Alignments once the
jpayne@69 885 // data is successfully output to the file.
jpayne@69 886
jpayne@69 887 {
jpayne@69 888 vector<Alignment>::iterator Ap; // alignment pointer
jpayne@69 889 vector<long int>::iterator Dp; // delta pointer
jpayne@69 890
jpayne@69 891 fprintf (DeltaFile, ">%s %s %ld %ld\n", Af->Id, Bf->Id, Af->len, Bf->len);
jpayne@69 892
jpayne@69 893 //-- Generate the error counts
jpayne@69 894 parseDelta (Alignments, Af, Bf);
jpayne@69 895
jpayne@69 896 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 897 {
jpayne@69 898 if ( Ap->dirB == FORWARD_CHAR )
jpayne@69 899 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
jpayne@69 900 Ap->sA, Ap->eA,
jpayne@69 901 Ap->sB, Ap->eB,
jpayne@69 902 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
jpayne@69 903 else
jpayne@69 904 fprintf (DeltaFile, "%ld %ld %ld %ld %ld %ld %ld\n",
jpayne@69 905 Ap->sA, Ap->eA,
jpayne@69 906 revC (Ap->sB, Bf->len), revC (Ap->eB, Bf->len),
jpayne@69 907 Ap->Errors, Ap->SimErrors, Ap->NonAlphas);
jpayne@69 908
jpayne@69 909 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
jpayne@69 910 fprintf (DeltaFile, "%ld\n", *Dp);
jpayne@69 911 fprintf (DeltaFile, "0\n");
jpayne@69 912
jpayne@69 913 Ap->delta.clear( );
jpayne@69 914 }
jpayne@69 915
jpayne@69 916 Alignments.clear( );
jpayne@69 917
jpayne@69 918 return;
jpayne@69 919 }
jpayne@69 920
jpayne@69 921
jpayne@69 922
jpayne@69 923
jpayne@69 924 void flushSyntenys
jpayne@69 925 (vector<Synteny> & Syntenys, FILE * ClusterFile)
jpayne@69 926
jpayne@69 927 // Simply output the synteny/cluster information generated by the mgaps
jpayne@69 928 // program. However, now the coordinates reference their appropriate
jpayne@69 929 // reference sequence, and the reference sequecne header is added to
jpayne@69 930 // the appropriate lines. Free the memory used by Syntenys once the
jpayne@69 931 // data is successfully output to the file.
jpayne@69 932
jpayne@69 933 {
jpayne@69 934 vector<Synteny>::iterator Sp; // synteny pointer
jpayne@69 935 vector<Cluster>::iterator Cp; // cluster pointer
jpayne@69 936 vector<Match>::iterator Mp; // match pointer
jpayne@69 937
jpayne@69 938 if ( ClusterFile )
jpayne@69 939 {
jpayne@69 940 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
jpayne@69 941 {
jpayne@69 942 fprintf (ClusterFile, ">%s %s %ld %ld\n", Sp->AfP->Id, Sp->Bf.Id,
jpayne@69 943 Sp->AfP->len, Sp->Bf.len);
jpayne@69 944 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
jpayne@69 945 {
jpayne@69 946 fprintf (ClusterFile, "%2d %2d\n", FORWARD_CHAR, Cp->dirB);
jpayne@69 947 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
jpayne@69 948 {
jpayne@69 949 if ( Cp->dirB == FORWARD_CHAR )
jpayne@69 950 fprintf (ClusterFile, "%8ld %8ld %6ld",
jpayne@69 951 Mp->sA, Mp->sB, Mp->len);
jpayne@69 952 else
jpayne@69 953 fprintf (ClusterFile, "%8ld %8ld %6ld",
jpayne@69 954 Mp->sA, revC (Mp->sB, Sp->Bf.len), Mp->len);
jpayne@69 955 if ( Mp != Cp->matches.begin( ) )
jpayne@69 956 fprintf (ClusterFile, "%6ld %6ld\n",
jpayne@69 957 Mp->sA - (Mp - 1)->sA - (Mp - 1)->len,
jpayne@69 958 Mp->sB - (Mp - 1)->sB - (Mp - 1)->len);
jpayne@69 959 else
jpayne@69 960 fprintf (ClusterFile, "%6s %6s\n", "-", "-");
jpayne@69 961 }
jpayne@69 962 Cp->matches.clear( );
jpayne@69 963 }
jpayne@69 964 Sp->clusters.clear( );
jpayne@69 965 }
jpayne@69 966 }
jpayne@69 967 else
jpayne@69 968 {
jpayne@69 969 for ( Sp = Syntenys.begin( ); Sp < Syntenys.end( ); Sp ++ )
jpayne@69 970 {
jpayne@69 971 for ( Cp = Sp->clusters.begin( ); Cp < Sp->clusters.end( ); Cp ++ )
jpayne@69 972 {
jpayne@69 973 Cp->matches.clear( );
jpayne@69 974 }
jpayne@69 975 Sp->clusters.clear( );
jpayne@69 976 }
jpayne@69 977 }
jpayne@69 978
jpayne@69 979 Syntenys.clear( );
jpayne@69 980
jpayne@69 981 return;
jpayne@69 982 }
jpayne@69 983
jpayne@69 984
jpayne@69 985
jpayne@69 986
jpayne@69 987 vector<Cluster>::iterator getForwardTargetCluster
jpayne@69 988 (vector<Cluster> & Clusters, vector<Cluster>::iterator CurrCp,
jpayne@69 989 long int & targetA, long int & targetB)
jpayne@69 990
jpayne@69 991 // Return the cluster that is most likely to successfully join (in a
jpayne@69 992 // forward direction) with the current cluster. The returned cluster
jpayne@69 993 // must contain 1 or more matches that are strictly greater than the end
jpayne@69 994 // of the current cluster. The targeted cluster must also be on a
jpayne@69 995 // diagonal close enough to the current cluster, so that a connection
jpayne@69 996 // could possibly be made by the alignment extender. Assumes clusters
jpayne@69 997 // have been sorted via AscendingClusterSort. Returns targeted cluster
jpayne@69 998 // and stores the target coordinates in targetA and targetB. If no
jpayne@69 999 // suitable cluster was found, the function will return NULL and target
jpayne@69 1000 // A and targetB will remain unchanged.
jpayne@69 1001
jpayne@69 1002 {
jpayne@69 1003 vector<Match>::iterator Mip; // match iteratrive pointer
jpayne@69 1004 vector<Cluster>::iterator Cp; // cluster pointer
jpayne@69 1005 vector<Cluster>::iterator Cip; // cluster iterative pointer
jpayne@69 1006 long int eA, eB; // possible target
jpayne@69 1007 long int greater, lesser; // gap sizes between two clusters
jpayne@69 1008 long int sA = CurrCp->matches.rbegin( )->sA +
jpayne@69 1009 CurrCp->matches.rbegin( )->len - 1; // the endA of the current cluster
jpayne@69 1010 long int sB = CurrCp->matches.rbegin( )->sB +
jpayne@69 1011 CurrCp->matches.rbegin( )->len - 1; // the endB of the current cluster
jpayne@69 1012
jpayne@69 1013 //-- End of sequences is the default target, set distance accordingly
jpayne@69 1014 long int dist = (targetA - sA < targetB - sB ? targetA - sA : targetB - sB);
jpayne@69 1015
jpayne@69 1016 //-- For all clusters greater than the current cluster (on sequence A)
jpayne@69 1017 Cp = Clusters.end( );
jpayne@69 1018 for ( Cip = CurrCp + 1; Cip < Clusters.end( ); Cip ++ )
jpayne@69 1019 {
jpayne@69 1020 //-- If the cluster is on the same direction
jpayne@69 1021 if ( CurrCp->dirB == Cip->dirB )
jpayne@69 1022 {
jpayne@69 1023 eA = Cip->matches.begin( )->sA;
jpayne@69 1024 eB = Cip->matches.begin( )->sB;
jpayne@69 1025
jpayne@69 1026 //-- If the cluster overlaps the current cluster, strip some matches
jpayne@69 1027 if ( ( eA < sA || eB < sB ) &&
jpayne@69 1028 Cip->matches.rbegin( )->sA >= sA &&
jpayne@69 1029 Cip->matches.rbegin( )->sB >= sB )
jpayne@69 1030 {
jpayne@69 1031 for ( Mip = Cip->matches.begin( );
jpayne@69 1032 Mip < Cip->matches.end( ) && ( eA < sA || eB < sB );
jpayne@69 1033 Mip ++ )
jpayne@69 1034 {
jpayne@69 1035 eA = Mip->sA;
jpayne@69 1036 eB = Mip->sB;
jpayne@69 1037 }
jpayne@69 1038 }
jpayne@69 1039
jpayne@69 1040 //-- If the cluster is strictly greater than current cluster
jpayne@69 1041 if ( eA >= sA && eB >= sB )
jpayne@69 1042 {
jpayne@69 1043 if ( eA - sA > eB - sB )
jpayne@69 1044 {
jpayne@69 1045 greater = eA - sA;
jpayne@69 1046 lesser = eB - sB;
jpayne@69 1047 }
jpayne@69 1048 else
jpayne@69 1049 {
jpayne@69 1050 lesser = eA - sA;
jpayne@69 1051 greater = eB - sB;
jpayne@69 1052 }
jpayne@69 1053
jpayne@69 1054 //-- If the cluster is close enough
jpayne@69 1055 if ( greater < getBreakLen( ) ||
jpayne@69 1056 (lesser) * GOOD_SCORE [getMatrixType( )] +
jpayne@69 1057 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 )
jpayne@69 1058 {
jpayne@69 1059 Cp = Cip;
jpayne@69 1060 targetA = eA;
jpayne@69 1061 targetB = eB;
jpayne@69 1062 break;
jpayne@69 1063 }
jpayne@69 1064 else if ( (greater << 1) - lesser < dist )
jpayne@69 1065 {
jpayne@69 1066 Cp = Cip;
jpayne@69 1067 targetA = eA;
jpayne@69 1068 targetB = eB;
jpayne@69 1069 dist = (greater << 1) - lesser;
jpayne@69 1070 }
jpayne@69 1071 }
jpayne@69 1072 }
jpayne@69 1073 }
jpayne@69 1074
jpayne@69 1075
jpayne@69 1076 return Cp;
jpayne@69 1077 }
jpayne@69 1078
jpayne@69 1079
jpayne@69 1080
jpayne@69 1081
jpayne@69 1082 vector<Alignment>::iterator getReverseTargetAlignment
jpayne@69 1083 (vector<Alignment> & Alignments, vector<Alignment>::iterator CurrAp)
jpayne@69 1084
jpayne@69 1085 // Return the alignment that is most likely to successfully join (in a
jpayne@69 1086 // reverse direction) with the current alignment. The returned alignment
jpayne@69 1087 // must be strictly less than the current cluster and be on a diagonal
jpayne@69 1088 // close enough to the current alignment, so that a connection
jpayne@69 1089 // could possibly be made by the alignment extender. Assumes clusters
jpayne@69 1090 // have been sorted via AscendingClusterSort and processed in order, so
jpayne@69 1091 // therefore all alignments are in order by their start A coordinate.
jpayne@69 1092
jpayne@69 1093 {
jpayne@69 1094 vector<Alignment>::iterator Ap; // alignment pointer
jpayne@69 1095 vector<Alignment>::iterator Aip; // alignment iterative pointer
jpayne@69 1096 long int eA, eB; // possible targets
jpayne@69 1097 long int greater, lesser; // gap sizes between the two alignments
jpayne@69 1098 long int sA = CurrAp->sA; // the startA of the current alignment
jpayne@69 1099 long int sB = CurrAp->sB; // the startB of the current alignment
jpayne@69 1100
jpayne@69 1101 //-- Beginning of sequences is the default target, set distance accordingly
jpayne@69 1102 long int dist = (sA < sB ? sA : sB);
jpayne@69 1103
jpayne@69 1104 //-- For all alignments less than the current alignment (on sequence A)
jpayne@69 1105 Ap = Alignments.end( );
jpayne@69 1106 for ( Aip = CurrAp - 1; Aip >= Alignments.begin( ); Aip -- )
jpayne@69 1107 {
jpayne@69 1108 //-- If the alignment is on the same direction
jpayne@69 1109 if ( CurrAp->dirB == Aip->dirB )
jpayne@69 1110 {
jpayne@69 1111 eA = Aip->eA;
jpayne@69 1112 eB = Aip->eB;
jpayne@69 1113
jpayne@69 1114 //-- If the alignment is strictly less than current cluster
jpayne@69 1115 if ( eA <= sA && eB <= sB )
jpayne@69 1116 {
jpayne@69 1117 if ( sA - eA > sB - eB )
jpayne@69 1118 {
jpayne@69 1119 greater = sA - eA;
jpayne@69 1120 lesser = sB - eB;
jpayne@69 1121 }
jpayne@69 1122 else
jpayne@69 1123 {
jpayne@69 1124 lesser = sA - eA;
jpayne@69 1125 greater = sB - eB;
jpayne@69 1126 }
jpayne@69 1127
jpayne@69 1128 //-- If the cluster is close enough
jpayne@69 1129 if ( greater < getBreakLen( ) ||
jpayne@69 1130 (lesser) * GOOD_SCORE [getMatrixType( )] +
jpayne@69 1131 (greater - lesser) * CONT_GAP_SCORE [getMatrixType( )] >= 0 )
jpayne@69 1132 {
jpayne@69 1133 Ap = Aip;
jpayne@69 1134 break;
jpayne@69 1135 }
jpayne@69 1136 else if ( (greater << 1) - lesser < dist )
jpayne@69 1137 {
jpayne@69 1138 Ap = Aip;
jpayne@69 1139 dist = (greater << 1) - lesser;
jpayne@69 1140 }
jpayne@69 1141 }
jpayne@69 1142 }
jpayne@69 1143 }
jpayne@69 1144
jpayne@69 1145
jpayne@69 1146 return Ap;
jpayne@69 1147 }
jpayne@69 1148
jpayne@69 1149
jpayne@69 1150
jpayne@69 1151
jpayne@69 1152 bool isShadowedCluster
jpayne@69 1153 (vector<Cluster>::iterator CurrCp,
jpayne@69 1154 vector<Alignment> & Alignments, vector<Alignment>::iterator Ap)
jpayne@69 1155
jpayne@69 1156 // Check if the current cluster is shadowed by a previously produced
jpayne@69 1157 // alignment region. Return true if it is, else false.
jpayne@69 1158
jpayne@69 1159 {
jpayne@69 1160 vector<Alignment>::iterator Aip; // alignment pointer
jpayne@69 1161
jpayne@69 1162 long int sA = CurrCp->matches.begin( )->sA;
jpayne@69 1163 long int eA = CurrCp->matches.rbegin( )->sA +
jpayne@69 1164 CurrCp->matches.rbegin( )->len - 1;
jpayne@69 1165 long int sB = CurrCp->matches.begin( )->sB;
jpayne@69 1166 long int eB = CurrCp->matches.rbegin( )->sB +
jpayne@69 1167 CurrCp->matches.rbegin( )->len - 1;
jpayne@69 1168
jpayne@69 1169 if ( ! Alignments.empty( ) ) // if there are alignments to use
jpayne@69 1170 {
jpayne@69 1171 //-- Look backwards in hope of finding a shadowing alignment
jpayne@69 1172 for ( Aip = Ap; Aip >= Alignments.begin( ); Aip -- )
jpayne@69 1173 {
jpayne@69 1174 //-- If in the same direction and shadowing the current cluster, break
jpayne@69 1175 if ( Aip->dirB == CurrCp->dirB )
jpayne@69 1176 if ( Aip->eA >= eA && Aip->eB >= eB )
jpayne@69 1177 if ( Aip->sA <= sA && Aip->sB <= sB )
jpayne@69 1178 break;
jpayne@69 1179 }
jpayne@69 1180
jpayne@69 1181 //-- Return true if the loop was broken, i.e. shadow found
jpayne@69 1182 if ( Aip >= Alignments.begin( ) )
jpayne@69 1183 return true;
jpayne@69 1184 }
jpayne@69 1185
jpayne@69 1186 //-- Return false if Alignments was empty or loop was not broken
jpayne@69 1187 return false;
jpayne@69 1188 }
jpayne@69 1189
jpayne@69 1190
jpayne@69 1191
jpayne@69 1192
jpayne@69 1193 void parseAbort
jpayne@69 1194 (const char * s)
jpayne@69 1195
jpayne@69 1196 // Abort the program if there was an error in parsing file 's'
jpayne@69 1197
jpayne@69 1198 {
jpayne@69 1199 fprintf (stderr,
jpayne@69 1200 "ERROR: Could not parse input from '%s'. \n"
jpayne@69 1201 "Please check the filename and format, or file a bug report\n", s);
jpayne@69 1202 exit (EXIT_FAILURE);
jpayne@69 1203 }
jpayne@69 1204
jpayne@69 1205
jpayne@69 1206
jpayne@69 1207
jpayne@69 1208 void parseDelta
jpayne@69 1209 (vector<Alignment> & Alignments,
jpayne@69 1210 const FastaRecord * Af, const FastaRecord *Bf)
jpayne@69 1211
jpayne@69 1212 // Use the delta information to generate the error counts for each
jpayne@69 1213 // alignment, and fill this information into the data type
jpayne@69 1214
jpayne@69 1215 {
jpayne@69 1216 char * A, * B, * Brev = NULL;
jpayne@69 1217 char ch1, ch2;
jpayne@69 1218 long int Delta;
jpayne@69 1219 int Sign;
jpayne@69 1220 long int i, Apos, Bpos;
jpayne@69 1221 long int Remain, Total;
jpayne@69 1222 long int Errors, SimErrors;
jpayne@69 1223 long int NonAlphas;
jpayne@69 1224 vector<Alignment>::iterator Ap;
jpayne@69 1225 vector<long int>::iterator Dp;
jpayne@69 1226
jpayne@69 1227 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 1228 {
jpayne@69 1229 A = Af->seq;
jpayne@69 1230 B = Bf->seq;
jpayne@69 1231
jpayne@69 1232 if ( Ap->dirB == REVERSE_CHAR )
jpayne@69 1233 {
jpayne@69 1234 if ( Brev == NULL )
jpayne@69 1235 {
jpayne@69 1236 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
jpayne@69 1237 strcpy ( Brev + 1, Bf->seq + 1 );
jpayne@69 1238 Brev[0] = '\0';
jpayne@69 1239 Reverse_Complement (Brev, 1, Bf->len);
jpayne@69 1240 B = Brev;
jpayne@69 1241 }
jpayne@69 1242 B = Brev;
jpayne@69 1243 }
jpayne@69 1244
jpayne@69 1245 Apos = Ap->sA;
jpayne@69 1246 Bpos = Ap->sB;
jpayne@69 1247
jpayne@69 1248 Errors = 0;
jpayne@69 1249 SimErrors = 0;
jpayne@69 1250 NonAlphas = 0;
jpayne@69 1251 Remain = Ap->eA - Ap->sA + 1;
jpayne@69 1252 Total = Remain;
jpayne@69 1253
jpayne@69 1254 //-- For all delta's in this alignment
jpayne@69 1255 for ( Dp = Ap->delta.begin( ); Dp < Ap->delta.end( ); Dp ++ )
jpayne@69 1256 {
jpayne@69 1257 Delta = *Dp;
jpayne@69 1258 Sign = Delta > 0 ? 1 : -1;
jpayne@69 1259 Delta = labs ( Delta );
jpayne@69 1260
jpayne@69 1261 //-- For all the bases before the next indel
jpayne@69 1262 for ( i = 1; i < Delta; i ++ )
jpayne@69 1263 {
jpayne@69 1264 ch1 = A [Apos ++];
jpayne@69 1265 ch2 = B [Bpos ++];
jpayne@69 1266
jpayne@69 1267 if ( !isalpha (ch1) )
jpayne@69 1268 {
jpayne@69 1269 ch1 = STOP_CHAR;
jpayne@69 1270 NonAlphas ++;
jpayne@69 1271 }
jpayne@69 1272 if ( !isalpha (ch2) )
jpayne@69 1273 {
jpayne@69 1274 ch2 = STOP_CHAR;
jpayne@69 1275 NonAlphas ++;
jpayne@69 1276 }
jpayne@69 1277
jpayne@69 1278 ch1 = toupper(ch1);
jpayne@69 1279 ch2 = toupper(ch2);
jpayne@69 1280 if ( 1 > MATCH_SCORE
jpayne@69 1281 [getMatrixType( )]
jpayne@69 1282 [ch1 - 'A']
jpayne@69 1283 [ch2 - 'A'] )
jpayne@69 1284 SimErrors ++;
jpayne@69 1285 if ( ch1 != ch2 )
jpayne@69 1286 Errors ++;
jpayne@69 1287 }
jpayne@69 1288
jpayne@69 1289 //-- Process the current indel
jpayne@69 1290 Remain -= i - 1;
jpayne@69 1291 Errors ++;
jpayne@69 1292 SimErrors ++;
jpayne@69 1293
jpayne@69 1294 if ( Sign == 1 )
jpayne@69 1295 {
jpayne@69 1296 Apos ++;
jpayne@69 1297 Remain --;
jpayne@69 1298 }
jpayne@69 1299 else
jpayne@69 1300 {
jpayne@69 1301 Bpos ++;
jpayne@69 1302 Total ++;
jpayne@69 1303 }
jpayne@69 1304 }
jpayne@69 1305
jpayne@69 1306 //-- For all the bases after the final indel
jpayne@69 1307 for ( i = 0; i < Remain; i ++ )
jpayne@69 1308 {
jpayne@69 1309 //-- Score character match and update error counters
jpayne@69 1310 ch1 = A [Apos ++];
jpayne@69 1311 ch2 = B [Bpos ++];
jpayne@69 1312
jpayne@69 1313 if ( !isalpha (ch1) )
jpayne@69 1314 {
jpayne@69 1315 ch1 = STOP_CHAR;
jpayne@69 1316 NonAlphas ++;
jpayne@69 1317 }
jpayne@69 1318 if ( !isalpha (ch2) )
jpayne@69 1319 {
jpayne@69 1320 ch2 = STOP_CHAR;
jpayne@69 1321 NonAlphas ++;
jpayne@69 1322 }
jpayne@69 1323
jpayne@69 1324 ch1 = toupper(ch1);
jpayne@69 1325 ch2 = toupper(ch2);
jpayne@69 1326 if ( 1 > MATCH_SCORE
jpayne@69 1327 [getMatrixType( )]
jpayne@69 1328 [ch1 - 'A']
jpayne@69 1329 [ch2 - 'A'] )
jpayne@69 1330 SimErrors ++;
jpayne@69 1331 if ( ch1 != ch2 )
jpayne@69 1332 Errors ++;
jpayne@69 1333 }
jpayne@69 1334
jpayne@69 1335 Ap->Errors = Errors;
jpayne@69 1336 Ap->SimErrors = SimErrors;
jpayne@69 1337 Ap->NonAlphas = NonAlphas;
jpayne@69 1338 }
jpayne@69 1339
jpayne@69 1340 if ( Brev != NULL )
jpayne@69 1341 free ( Brev );
jpayne@69 1342
jpayne@69 1343 return;
jpayne@69 1344 }
jpayne@69 1345
jpayne@69 1346
jpayne@69 1347
jpayne@69 1348
jpayne@69 1349 void processSyntenys
jpayne@69 1350 (vector<Synteny> & Syntenys, FastaRecord * Af, long int As,
jpayne@69 1351 FILE * QryFile, FILE * ClusterFile, FILE * DeltaFile)
jpayne@69 1352
jpayne@69 1353 // For each syntenic region with clusters, read in the B sequence and
jpayne@69 1354 // extend the clusters to expand total alignment coverage. Only should
jpayne@69 1355 // be called once all the clusters for the contained syntenic regions have
jpayne@69 1356 // been stored in the data structure. Frees the memory used by the
jpayne@69 1357 // the syntenic regions once the output of extendClusters and
jpayne@69 1358 // flushSyntenys has been produced.
jpayne@69 1359
jpayne@69 1360 {
jpayne@69 1361 FastaRecord Bf; // the current B sequence
jpayne@69 1362 Bf.Id = (char *) Safe_malloc (sizeof(char) * (MAX_LINE + 1));
jpayne@69 1363
jpayne@69 1364 long int InitSize = INIT_SIZE; // the initial memory capacity of B
jpayne@69 1365 vector<Synteny>::iterator CurrSp; // current synteny pointer
jpayne@69 1366
jpayne@69 1367 //-- Initialize the B sequence storage
jpayne@69 1368 Bf.seq = (char *) Safe_malloc ( sizeof(char) * InitSize );
jpayne@69 1369
jpayne@69 1370 //-- For all the contained syntenys
jpayne@69 1371 for ( CurrSp = Syntenys.begin( ); CurrSp < Syntenys.end( ); CurrSp ++ )
jpayne@69 1372 {
jpayne@69 1373 //-- If no clusters, ignore
jpayne@69 1374 if ( CurrSp->clusters.empty( ) )
jpayne@69 1375 continue;
jpayne@69 1376
jpayne@69 1377 //-- If a B sequence not seen yet, read it in
jpayne@69 1378 //-- IMPORTANT: The B sequences in the synteny object are assumed to be
jpayne@69 1379 // ordered as output by mgaps, if they are not in order the program
jpayne@69 1380 // will fail. (All like tags must be adjacent and in the same order
jpayne@69 1381 // as the query file)
jpayne@69 1382 if ( CurrSp == Syntenys.begin( ) ||
jpayne@69 1383 strcmp (CurrSp->Bf.Id, (CurrSp-1)->Bf.Id) != 0 )
jpayne@69 1384 {
jpayne@69 1385 //-- Read in the B sequence
jpayne@69 1386 while ( Read_String (QryFile, Bf.seq, InitSize, Bf.Id, FALSE) )
jpayne@69 1387 if ( strcmp (CurrSp->Bf.Id, Bf.Id) == 0 )
jpayne@69 1388 break;
jpayne@69 1389 if ( strcmp (CurrSp->Bf.Id, Bf.Id) != 0 )
jpayne@69 1390 parseAbort ( "Query File" );
jpayne@69 1391 Bf.len = strlen (Bf.seq + 1);
jpayne@69 1392 }
jpayne@69 1393
jpayne@69 1394 //-- Extend clusters and create the alignment information
jpayne@69 1395 CurrSp->Bf.len = Bf.len;
jpayne@69 1396 extendClusters (CurrSp->clusters, CurrSp->AfP, &Bf, DeltaFile);
jpayne@69 1397 }
jpayne@69 1398
jpayne@69 1399 //-- Create the cluster information
jpayne@69 1400 flushSyntenys (Syntenys, ClusterFile);
jpayne@69 1401
jpayne@69 1402 free (Bf.Id);
jpayne@69 1403 free (Bf.seq);
jpayne@69 1404
jpayne@69 1405 return;
jpayne@69 1406 }
jpayne@69 1407
jpayne@69 1408
jpayne@69 1409
jpayne@69 1410
jpayne@69 1411 inline long int revC
jpayne@69 1412 (long int Coord, long int Len)
jpayne@69 1413
jpayne@69 1414 // Reverse complement the given coordinate for the given length.
jpayne@69 1415
jpayne@69 1416 {
jpayne@69 1417 assert (Len - Coord + 1 > 0);
jpayne@69 1418 return (Len - Coord + 1);
jpayne@69 1419 }
jpayne@69 1420
jpayne@69 1421
jpayne@69 1422
jpayne@69 1423
jpayne@69 1424 void printHelp
jpayne@69 1425 (const char * s)
jpayne@69 1426
jpayne@69 1427 // Display the program's help information to stderr.
jpayne@69 1428
jpayne@69 1429 {
jpayne@69 1430 fprintf (stderr,
jpayne@69 1431 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
jpayne@69 1432 fprintf (stderr,
jpayne@69 1433 "-b int set the alignment break (give-up) length to int\n"
jpayne@69 1434 "-B int set the diagonal banding for extension to int\n"
jpayne@69 1435 "-d output only match clusters rather than extended alignments\n"
jpayne@69 1436 "-e do not extend alignments outward from clusters\n"
jpayne@69 1437 "-h display help information\n"
jpayne@69 1438 "-s don't remove shadowed alignments, useful for aligning a\n"
jpayne@69 1439 " sequence to itself to identify repeats\n"
jpayne@69 1440 "-t force alignment to ends of sequence if within -b distance\n\n");
jpayne@69 1441 fprintf (stderr,
jpayne@69 1442 " Input is the output of the \"mgaps\" program from stdin, and\n"
jpayne@69 1443 "the two original NUCmer sequence files passed on the command\n"
jpayne@69 1444 "line. <pfx> is the prefix to be added to the front of the\n"
jpayne@69 1445 "output file <pfx>.delta\n"
jpayne@69 1446 " <pfx>.delta is the alignment object that catalogs the distance\n"
jpayne@69 1447 "between insertions and deletions. For further information\n"
jpayne@69 1448 "regarding this file, please refer to the documentation under\n"
jpayne@69 1449 "the .delta output description.\n\n");
jpayne@69 1450 return;
jpayne@69 1451 }
jpayne@69 1452
jpayne@69 1453
jpayne@69 1454
jpayne@69 1455
jpayne@69 1456 void printUsage
jpayne@69 1457 (const char * s)
jpayne@69 1458
jpayne@69 1459 // Display the program's usage information to stderr.
jpayne@69 1460
jpayne@69 1461 {
jpayne@69 1462 fprintf (stderr,
jpayne@69 1463 "\nUSAGE: %s [options] <reference> <query> <pfx> < <input>\n\n", s);
jpayne@69 1464 fprintf (stderr, "Try '%s -h' for more information.\n", s);
jpayne@69 1465 return;
jpayne@69 1466 }
jpayne@69 1467
jpayne@69 1468
jpayne@69 1469
jpayne@69 1470
jpayne@69 1471 void validateData
jpayne@69 1472 (vector<Alignment> Alignments, vector<Cluster> Clusters,
jpayne@69 1473 const FastaRecord * Af, const FastaRecord * Bf)
jpayne@69 1474
jpayne@69 1475 // Self test function to check that the cluster and alignment information
jpayne@69 1476 // is valid
jpayne@69 1477
jpayne@69 1478 {
jpayne@69 1479 char * A, * B, * Brev = NULL;
jpayne@69 1480 vector<Cluster>::iterator Cp;
jpayne@69 1481 vector<Match>::iterator Mp;
jpayne@69 1482 vector<Alignment>::iterator Ap;
jpayne@69 1483 long int x, y, i;
jpayne@69 1484 char Xc, Yc;
jpayne@69 1485
jpayne@69 1486 A = Af->seq;
jpayne@69 1487
jpayne@69 1488 for ( Cp = Clusters.begin( ); Cp < Clusters.end( ); Cp ++ )
jpayne@69 1489 {
jpayne@69 1490 assert ( Cp->wasFused );
jpayne@69 1491
jpayne@69 1492 //-- Pick the right directional sequence for B
jpayne@69 1493 if ( Cp->dirB == FORWARD_CHAR )
jpayne@69 1494 B = Bf->seq;
jpayne@69 1495 else if ( Brev != NULL )
jpayne@69 1496 B = Brev;
jpayne@69 1497 else
jpayne@69 1498 {
jpayne@69 1499 Brev = (char *) Safe_malloc ( sizeof(char) * (Bf->len + 2) );
jpayne@69 1500 strcpy ( Brev + 1, Bf->seq + 1 );
jpayne@69 1501 Brev[0] = '\0';
jpayne@69 1502 Reverse_Complement (Brev, 1, Bf->len);
jpayne@69 1503 B = Brev;
jpayne@69 1504 }
jpayne@69 1505
jpayne@69 1506 for ( Mp = Cp->matches.begin( ); Mp < Cp->matches.end( ); Mp ++ )
jpayne@69 1507 {
jpayne@69 1508 //-- assert for each match in cluster, it is indeed a match
jpayne@69 1509 x = Mp->sA;
jpayne@69 1510 y = Mp->sB;
jpayne@69 1511 for ( i = 0; i < Mp->len; i ++ )
jpayne@69 1512 assert ( A[x ++] == B[y ++] );
jpayne@69 1513
jpayne@69 1514 //-- assert for each match in cluster, it is contained in an alignment
jpayne@69 1515 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 1516 {
jpayne@69 1517 if ( Ap->sA <= Mp->sA && Ap->sB <= Mp->sB &&
jpayne@69 1518 Ap->eA >= Mp->sA + Mp->len - 1 &&
jpayne@69 1519 Ap->eB >= Mp->sB + Mp->len - 1 )
jpayne@69 1520 break;
jpayne@69 1521 }
jpayne@69 1522 assert ( Ap < Alignments.end( ) );
jpayne@69 1523 }
jpayne@69 1524 }
jpayne@69 1525
jpayne@69 1526 //-- assert alignments are optimal (quick check if first and last chars equal)
jpayne@69 1527 for ( Ap = Alignments.begin( ); Ap < Alignments.end( ); Ap ++ )
jpayne@69 1528 {
jpayne@69 1529 if ( Ap->dirB == REVERSE_CHAR )
jpayne@69 1530 {
jpayne@69 1531 assert (Brev != NULL);
jpayne@69 1532 B = Brev;
jpayne@69 1533 }
jpayne@69 1534 else
jpayne@69 1535 B = Bf->seq;
jpayne@69 1536
jpayne@69 1537 assert ( Ap->sA <= Ap->eA );
jpayne@69 1538 assert ( Ap->sB <= Ap->eB );
jpayne@69 1539
jpayne@69 1540 assert ( Ap->sA >= 1 && Ap->sA <= Af->len );
jpayne@69 1541 assert ( Ap->eA >= 1 && Ap->eA <= Af->len );
jpayne@69 1542 assert ( Ap->sB >= 1 && Ap->sB <= Bf->len );
jpayne@69 1543 assert ( Ap->eB >= 1 && Ap->eB <= Bf->len );
jpayne@69 1544
jpayne@69 1545 Xc = toupper(isalpha(A[Ap->sA]) ? A[Ap->sA] : STOP_CHAR);
jpayne@69 1546 Yc = toupper(isalpha(B[Ap->sB]) ? B[Ap->sB] : STOP_CHAR);
jpayne@69 1547 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] );
jpayne@69 1548
jpayne@69 1549 Xc = toupper(isalpha(A[Ap->eA]) ? A[Ap->eA] : STOP_CHAR);
jpayne@69 1550 Yc = toupper(isalpha(B[Ap->eB]) ? B[Ap->eB] : STOP_CHAR);
jpayne@69 1551 assert ( 0 <= MATCH_SCORE [0] [Xc - 'A'] [Yc - 'A'] );
jpayne@69 1552 }
jpayne@69 1553
jpayne@69 1554 if ( Brev != NULL )
jpayne@69 1555 free (Brev);
jpayne@69 1556
jpayne@69 1557 return;
jpayne@69 1558 }