comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/aligner/SingleStateAlignerFlat2.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package aligner;
2
3 import dna.AminoAcid;
4 import shared.KillSwitch;
5 import shared.Tools;
6
7 /**
8 * Based on SSAFlat, but with previous state pointers removed. */
9 public final class SingleStateAlignerFlat2 implements Aligner {
10
11
12 public SingleStateAlignerFlat2(){}
13
14 private void prefillTopRow(){
15 final int[] header=packed[0];
16 final int qlen=rows;
17 for(int i=0; i<=columns; i++){
18 int x=columns-i+1;
19 int qbases=qlen-x;
20
21 //Minimal points to prefer a leftmost alignment
22 header[i]=qbases<=0 ? 0 : -qbases;
23
24 //Forces consumption of query, but does not allow for insertions...
25 // header[i]=qbases<=0 ? 0 : calcDelScoreOffset(qbases);
26 }
27 }
28
29 private void prefillLeftColumnStartingAt(int i){
30 packed[0][0]=MODE_MATCH;
31 i=Tools.max(1, i);
32 for(int score=MODE_INS+(POINTS_INS*i); i<=maxRows; i++){//Fill column 0 with insertions
33 score+=POINTS_INS;
34 packed[i][0]=score;
35 }
36 }
37
38 private void initialize(int rows_, int columns_){
39 rows=rows_;
40 columns=columns_;
41 if(rows<=maxRows && columns<=maxColumns){
42 prefillTopRow();
43 // prefillLeftColumn();
44 return;
45 }
46
47 final int maxRows0=maxRows;
48 final int maxColumns0=maxColumns;
49 final int[][] packed0=packed;
50
51 //Monotonic increase
52 maxRows=Tools.max(maxRows, rows+10);
53 maxColumns=Tools.max(maxColumns, columns+10);
54
55 if(packed==null || maxColumns>maxColumns0){//Make a new matrix
56 packed=KillSwitch.allocInt2D(maxRows+1, maxColumns+1);
57 prefillLeftColumnStartingAt(1);
58 }else{//Copy old rows
59 assert(maxRows0>0 && maxColumns0>0);
60 assert(maxRows>maxRows0 && maxColumns<=maxColumns0) : "rows="+rows+",maxRows="+maxRows+
61 ",maxRows0="+maxRows0+",columns="+columns+",maxColumns="+maxColumns+",maxColumns0="+maxColumns0;
62 packed=KillSwitch.allocInt2D(maxRows+1);
63 for(int i=0; i<packed.length; i++){
64 if(i<packed0.length){
65 packed[i]=packed0[i];
66 }else{
67 packed[i]=KillSwitch.allocInt1D(maxColumns+1);
68 }
69 }
70 //Fill column 0 with insertions
71 prefillLeftColumnStartingAt(maxRows0);
72 }
73 prefillTopRow();
74 }
75
76 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
77 * Will not fill areas that cannot match minScore */
78 @Override
79 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
80 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore);
81 }
82
83 @Override
84 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){
85 return fillUnlimited(read, ref, refStartLoc, refEndLoc, -999999);
86 }
87
88 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart};
89 * Min score is optional */
90 @Override
91 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
92 initialize(read.length, refEndLoc-refStartLoc+1);
93
94 //temporary, for finding a bug
95 if(rows>maxRows || columns>maxColumns){
96 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n");
97 }
98
99 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows;
100 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns;
101
102 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc;
103 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length;
104
105 final int refOffset=refStartLoc-1;
106 for(int row=1; row<=rows; row++){
107
108 final byte qBase=read[row-1];
109 for(int col=1; col<=columns; col++){
110
111 final byte rBase=ref[refOffset+col];
112
113 final boolean match=(qBase==rBase);
114 final boolean defined=(qBase!='N' && rBase!='N');
115
116 final int scoreFromDiag=packed[row-1][col-1];
117 final int scoreFromDel=packed[row][col-1];
118 final int scoreFromIns=packed[row-1][col];
119
120 final int diagScoreM=POINTS_MATCH;
121 final int diagScoreS=POINTS_SUB;
122 final int delScore=scoreFromDel+POINTS_DEL;
123 final int insScore=scoreFromIns+POINTS_INS;
124
125 // final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF);
126 int diagScore=(match ? diagScoreM : diagScoreS);
127 diagScore=scoreFromDiag+(defined ? diagScore : POINTS_NOREF);
128
129 int score=diagScore>=delScore ? diagScore : delScore;
130 score=score>=insScore ? score : insScore;
131
132 packed[row][col]=score;
133 }
134 //iterationsUnlimited+=columns;
135 }
136
137
138 int maxCol=-1;
139 int maxState=-1;
140 int maxStart=-1;
141 int maxScore=Integer.MIN_VALUE;
142
143 for(int col=1; col<=columns; col++){
144 int x=packed[rows][col];
145 if(x>maxScore){
146 maxScore=x;
147 maxCol=col;
148
149 // assert(rows-1<read.length) : (rows-1)+", "+read.length;
150 // assert(refOffset+col<ref.length) : refOffset+", "+col+", "+ref.length;
151 maxState=getState(rows, col, read[rows-1], ref[refOffset+col]);
152 maxStart=x;
153 }
154 }
155
156 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore);
157 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart};
158 }
159
160 int getState(int row, int col, byte q, byte r){//zxvzxcv TODO: Fix - needs to find max
161 final boolean match=(q==r);
162 final boolean defined=(q!='N' && r!='N');
163
164 final int scoreFromDiag=packed[row-1][col-1];
165 final int scoreFromDel=packed[row][col-1];
166 final int scoreFromIns=packed[row-1][col];
167 // final int score=packed[row][col];
168
169 final int diagScoreM=POINTS_MATCH;
170 final int diagScoreS=POINTS_SUB;
171 final int delScore=scoreFromDel+POINTS_DEL;
172 final int insScore=scoreFromIns+POINTS_INS;
173
174 final int diagScore=scoreFromDiag+(defined ? (match ? diagScoreM : diagScoreS) : POINTS_NOREF);
175
176 // int score2=diagScore>=delScore ? diagScore : delScore;
177 // score2=score>=insScore ? score : insScore;
178
179 // assert(score==score2) : score+", "+score2;
180
181 if(diagScore>=delScore && diagScore>=insScore){
182 return defined ? match ? MODE_MATCH : MODE_SUB : MODE_N;
183 }else if(delScore>=insScore){
184 return MODE_DEL;
185 }
186 return MODE_INS;
187 }
188
189
190 /** Generates the match string.
191 * State is NOT used. */
192 @Override
193 public final byte[] traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){
194 // assert(false);
195 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
196 assert(row==rows);
197
198 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1".
199 int outPos=0;
200
201 // assert(state==(packed[row][col]&MODEMASK));
202
203 while(row>0 && col>0){
204 byte q=query[row-1];
205 byte r=ref[refStartLoc+col-1];
206 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
207 state=getState(row, col, q, r);
208 if(state==MODE_MATCH){
209 col--;
210 row--;
211 out[outPos]=defined ? (byte)'m' : (byte)'N';
212 }else if(state==MODE_SUB){
213 col--;
214 row--;
215 out[outPos]=defined ? (byte)'S' : (byte)'N';
216 }else if(state==MODE_N){
217 col--;
218 row--;
219 out[outPos]='N';
220 }else if(state==MODE_DEL){
221 col--;
222 out[outPos]='D';
223 }else if(state==MODE_INS){
224 row--;
225 // out[outPos]='I';
226 // out[outPos]=(col<0 || col>=columns ? (byte)'C' : (byte)'I');
227 if(col>=0 && col<columns){
228 out[outPos]='I';
229 }else{
230 out[outPos]='C';
231 col--;
232 }
233 }else{
234 assert(false) : state;
235 }
236 outPos++;
237 }
238
239 assert(row==0 || col==0);
240 if(col!=row){
241 while(row>0){
242 out[outPos]='C';
243 outPos++;
244 row--;
245 col--;
246 }
247 if(col>0){
248 //do nothing
249 }
250 }
251
252 //Shrink and reverse the string
253 byte[] out2=new byte[outPos];
254 for(int i=0; i<outPos; i++){
255 out2[i]=out[outPos-i-1];
256 }
257 out=null;
258
259 return out2;
260 }
261
262 @Override
263 /** Generates identity;
264 * fills 'extra' with {match, sub, del, ins, N, clip} if present */
265 public float tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){
266
267 // assert(false);
268 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc;
269 assert(row==rows);
270
271 // assert(state==(packed[row][col]&MODEMASK));
272 int match=0, sub=0, del=0, ins=0, noref=0, clip=0;
273
274 while(row>0 && col>0){
275 byte q=query[row-1];
276 byte r=ref[refStartLoc+col-1];
277 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
278 state=getState(row, col, q, r);
279 if(state==MODE_MATCH){
280 col--;
281 row--;
282 match+=(defined ? 1 : 0);
283 noref+=(defined ? 0 : 1);
284 }else if(state==MODE_SUB){
285 col--;
286 row--;
287 sub+=(defined ? 1 : 0);
288 noref+=(defined ? 0 : 1);
289 }else if(state==MODE_N){
290 col--;
291 row--;
292 noref++;
293 }else if(state==MODE_DEL){
294 col--;
295 del++;
296 }else if(state==MODE_INS){
297 row--;
298 boolean edge=(col<=1 || col>=columns);
299 ins+=(edge ? 0 : 1);
300 clip+=(edge ? 1 : 0);
301 }else{
302 assert(false) : state;
303 }
304 }
305
306 assert(row==0 || col==0);
307 if(col!=row){//Not sure what this is doing
308 while(row>0){
309 clip++;
310 row--;
311 col--;
312 }
313 if(col>0){
314 //do nothing
315 }
316 }
317
318 if(extra!=null){
319 assert(extra.length==5);
320 extra[0]=match;
321 extra[1]=sub;
322 extra[2]=del;
323 extra[3]=ins;
324 extra[4]=noref;
325 extra[5]=clip;
326 }
327
328 float len=match+sub+ins+del+noref*0.1f;
329 float id=match/Tools.max(1.0f, len);
330 return id;
331 }
332
333 /** @return {score, bestRefStart, bestRefStop} */
334 @Override
335 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc,
336 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){
337
338 int row=maxRow;
339 int col=maxCol;
340 int state=maxState;
341
342 assert(maxState>=0 && maxState<packed.length) :
343 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
344 assert(maxRow>=0 && maxRow<packed.length) :
345 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
346 assert(maxCol>=0 && maxCol<packed[0].length) :
347 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc);
348
349 int score=packed[maxRow][maxCol]; //Or zero, if it is to be recalculated
350
351 if(row<rows){
352 int difR=rows-row;
353 int difC=columns-col;
354
355 while(difR>difC){
356 score+=POINTS_NOREF;
357 difR--;
358 }
359
360 row+=difR;
361 col+=difR;
362
363 }
364
365 assert(refStartLoc<=refEndLoc);
366 assert(row==rows);
367
368
369 final int bestRefStop=refStartLoc+col-1;
370
371 while(row>0 && col>0){
372 final byte q=read[row-1];
373 final byte r=ref[refStartLoc+col-1];
374 // final boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r));
375 state=getState(row, col, q, r);
376 if(state==MODE_MATCH){
377 col--;
378 row--;
379 }else if(state==MODE_SUB){
380 col--;
381 row--;
382 }else if(state==MODE_N){
383 col--;
384 row--;
385 }else if(state==MODE_DEL){
386 col--;
387 }else if(state==MODE_INS){
388 row--;
389 }else{
390 assert(false) : state;
391 }
392 }
393 // assert(false) : row+", "+col;
394 if(row>col){
395 col-=row;
396 }
397
398 final int bestRefStart=refStartLoc+col;
399
400 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart);
401 int[] rvec;
402 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow
403 int padLeft=Tools.max(0, refStartLoc-bestRefStart);
404 int padRight=Tools.max(0, bestRefStop-refEndLoc);
405 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight};
406 }else{
407 rvec=new int[] {score, bestRefStart, bestRefStop};
408 }
409 return rvec;
410 }
411
412
413 /** Will not fill areas that cannot match minScore.
414 * @return {score, bestRefStart, bestRefStop} */
415 @Override
416 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){
417 int a=Tools.max(0, refStartLoc);
418 int b=Tools.min(ref.length-1, refEndLoc);
419 assert(b>=a);
420
421 if(b-a>=maxColumns){
422 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns);
423 assert(false) : refStartLoc+", "+refEndLoc;
424 b=Tools.min(ref.length-1, a+maxColumns-1);
425 }
426 int[] max=fillLimited(read, ref, a, b, minScore);
427 // return max==null ? null : new int[] {max[3], 0, max[1]};
428
429 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/));
430
431 return score;
432 }
433
434 public static final String toString(byte[] ref, int startLoc, int stopLoc){
435 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1);
436 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);}
437 return sb.toString();
438 }
439
440 // public static int calcDelScore(int len){
441 // if(len<=0){return 0;}
442 // int score=POINTS_DEL;
443 // if(len>1){
444 // score+=(len-1)*POINTS_DEL2;
445 // }
446 // return score;
447 // }
448
449 // public int maxScoreByIdentity(int len, float identity){
450 // assert(identity>=0 && identity<=1);
451 // return (int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
452 // }
453
454 @Override
455 public int minScoreByIdentity(int len, float identity){
456 assert(identity>=0 && identity<=1);
457
458 int a=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB));
459 int b=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_INS));
460 int c=(int)(len*(1*POINTS_MATCH+((1/(Tools.max(identity, 0.000001f)))-1)*POINTS_DEL));
461 return Tools.min(a, b, c);
462 }
463
464 private static int calcDelScore(int len){
465 if(len<=0){return 0;}
466 int score=POINTS_DEL*len;
467 return score;
468 }
469 //
470 // public static int calcInsScore(int len){
471 // if(len<=0){return 0;}
472 // int score=POINTS_INS;
473 //
474 // if(len>1){
475 // score+=(len-1)*POINTS_INS2;
476 // }
477 // return score;
478 // }
479 //
480 // private static int calcInsScoreOffset(int len){
481 // if(len<=0){return 0;}
482 // int score=POINTS_INS;
483 //
484 // if(len>1){
485 // score+=(len-1)*POINTS_INS2;
486 // }
487 // return score;
488 // }
489
490 @Override
491 public int rows(){return rows;}
492 @Override
493 public int columns(){return columns;}
494
495
496 private int maxRows;
497 private int maxColumns;
498
499 private int[][] packed;
500
501 public static final int MAX_SCORE=Integer.MAX_VALUE-2000;
502 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD".
503
504 //For some reason changing MODE_DEL from 1 to 0 breaks everything
505 private static final byte MODE_DEL=1;
506 private static final byte MODE_INS=2;
507 private static final byte MODE_SUB=3;
508 private static final byte MODE_MATCH=4;
509 private static final byte MODE_N=5;
510
511 public static final int POINTS_NOREF=-20;
512 public static final int POINTS_MATCH=100;
513 public static final int POINTS_SUB=-50;
514 public static final int POINTS_INS=-121;
515 public static final int POINTS_DEL=-111;
516
517 // public static final int POINTS_NOREF=-150000;
518 // public static final int POINTS_MATCH=100;
519 // public static final int POINTS_SUB=-100;
520 // public static final int POINTS_INS=-100;
521 // public static final int POINTS_DEL=-100;
522
523 public static final int BAD=MIN_SCORE-1;
524
525 private int rows;
526 private int columns;
527
528 // public long iterationsLimited=0;
529 // public long iterationsUnlimited=0;
530
531 public boolean verbose=false;
532 public boolean verbose2=false;
533
534 }