Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/aligner/SingleStateAlignerFlat.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package aligner; | |
2 | |
3 import dna.AminoAcid; | |
4 import shared.KillSwitch; | |
5 import shared.Tools; | |
6 | |
7 /** | |
8 * Based on MSA9PBA, but reduced to a single matrix. */ | |
9 public final class SingleStateAlignerFlat implements Aligner { | |
10 | |
11 | |
12 public SingleStateAlignerFlat(){} | |
13 | |
14 private void prefillTopRow(){ | |
15 final int[] header=packed[0]; | |
16 final int qlen=rows; | |
17 for(int i=0; i<=columns; i++){ | |
18 int x=columns-i+1; | |
19 int qbases=qlen-x; | |
20 | |
21 //Minimal points to prefer a leftmost alignment | |
22 header[i]=qbases<=0 ? 0 : ((-qbases)<<STARTOFFSET); | |
23 | |
24 //Forces consumption of query, but does not allow for insertions... | |
25 // header[i]=qbases<=0 ? 0 : calcDelScoreOffset(qbases); | |
26 } | |
27 } | |
28 | |
29 private void prefillLeftColumnStartingAt(int i){ | |
30 packed[0][0]=MODE_MATCH; | |
31 i=Tools.max(1, i); | |
32 for(int score=MODE_INS+(POINTSoff_INS*i); i<=maxRows; i++){//Fill column 0 with insertions | |
33 score+=POINTSoff_INS; | |
34 packed[i][0]=score; | |
35 } | |
36 } | |
37 | |
38 // private void clearInner(){//NotmaxRows needed | |
39 // for(int i=1; i<=maxRows; i++){ | |
40 // for(int j=1; j<=maxColumns; j++){ | |
41 // packed[i][j]=0; | |
42 // } | |
43 // } | |
44 // } | |
45 // | |
46 // private void clearAll(){//NotmaxRows needed | |
47 // for(int i=0; i<=maxRows; i++){ | |
48 // for(int j=0; j<=maxColumns; j++){ | |
49 // packed[i][j]=0; | |
50 // } | |
51 // } | |
52 // } | |
53 | |
54 private void initialize(int rows_, int columns_){ | |
55 rows=rows_; | |
56 columns=columns_; | |
57 if(rows<=maxRows && columns<=maxColumns){ | |
58 prefillTopRow(); | |
59 // prefillLeftColumn(); | |
60 return; | |
61 } | |
62 | |
63 final int maxRows0=maxRows; | |
64 final int maxColumns0=maxColumns; | |
65 final int[][] packed0=packed; | |
66 | |
67 //Monotonic increase | |
68 maxRows=Tools.max(maxRows, rows+10); | |
69 maxColumns=Tools.max(maxColumns, columns+10); | |
70 | |
71 if(packed==null || maxColumns>maxColumns0){//Make a new matrix | |
72 packed=KillSwitch.allocInt2D(maxRows+1, maxColumns+1); | |
73 prefillLeftColumnStartingAt(1); | |
74 }else{//Copy old rows | |
75 assert(maxRows0>0 && maxColumns0>0); | |
76 assert(maxRows>maxRows0 && maxColumns<=maxColumns0); | |
77 packed=KillSwitch.allocInt2D(maxRows+1); | |
78 for(int i=0; i<packed.length; i++){ | |
79 if(i<packed0.length){ | |
80 packed[i]=packed0[i]; | |
81 }else{ | |
82 packed[i]=KillSwitch.allocInt1D(maxColumns+1); | |
83 } | |
84 } | |
85 //Fill column 0 with insertions | |
86 prefillLeftColumnStartingAt(maxRows0); | |
87 } | |
88 prefillTopRow(); | |
89 } | |
90 | |
91 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart}; | |
92 * Will not fill areas that cannot match minScore */ | |
93 @Override | |
94 public final int[] fillLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ | |
95 return fillUnlimited(read, ref, refStartLoc, refEndLoc, minScore); | |
96 } | |
97 | |
98 @Override | |
99 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ | |
100 return fillUnlimited(read, ref, refStartLoc, refEndLoc, -999999); | |
101 } | |
102 | |
103 /** return new int[] {rows, maxCol, maxState, maxScore, maxStart}; | |
104 * Min score is optional */ | |
105 @Override | |
106 public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ | |
107 initialize(read.length, refEndLoc-refStartLoc+1); | |
108 | |
109 //temporary, for finding a bug | |
110 if(rows>maxRows || columns>maxColumns){ | |
111 throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); | |
112 } | |
113 | |
114 assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; | |
115 assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; | |
116 | |
117 assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; | |
118 assert(refEndLoc<ref.length) : "Check that values are in-bounds before calling this function: "+refEndLoc+", "+ref.length; | |
119 | |
120 final int refOffset=refStartLoc-1; | |
121 for(int row=1; row<=rows; row++){ | |
122 | |
123 final byte qBase=read[row-1]; | |
124 for(int col=1; col<=columns; col++){ | |
125 | |
126 final byte rBase=ref[refOffset+col]; | |
127 | |
128 final boolean match=(qBase==rBase); | |
129 final boolean defined=(qBase!='N' && rBase!='N'); | |
130 | |
131 final int valueFromDiag=packed[row-1][col-1]; | |
132 final int valueFromDel=packed[row][col-1]; | |
133 final int valueFromIns=packed[row-1][col]; | |
134 // final int stateFromDiag=valueFromDiag&MODEMASK; | |
135 final int scoreFromDiag=valueFromDiag&HIGHMASK; | |
136 // final int stateFromDel=valueFromDel&MODEMASK; | |
137 final int scoreFromDel=valueFromDel&HIGHMASK; | |
138 // final int stateFromIns=valueFromIns&MODEMASK; | |
139 final int scoreFromIns=valueFromIns&HIGHMASK; | |
140 | |
141 // final boolean prevMatch=stateFromDiag==MODE_MATCH; | |
142 // final boolean prevSub=stateFromDiag==MODE_SUB; | |
143 // final boolean prevDel=stateFromDel==MODE_DEL; | |
144 // final boolean prevIns=stateFromIns==MODE_INS; | |
145 | |
146 //Old conditional code, replaced by faster arrays | |
147 // final int diagScoreM=MODE_MATCH|(prevMatch ? POINTSoff_MATCH2 : POINTSoff_MATCH); | |
148 // final int diagScoreS=MODE_SUB|(prevSub ? POINTSoff_SUB2 : POINTSoff_SUB); | |
149 // final int delScore=scoreFromDel+(prevDel ? POINTSoff_DEL2 : POINTSoff_DEL)|MODE_DEL; | |
150 // final int insScore=scoreFromIns+(prevIns ? POINTSoff_INS2 : POINTSoff_INS)|MODE_INS; | |
151 | |
152 final int diagScoreM=POINTSoff_MATCH|MODE_MATCH; | |
153 final int diagScoreS=POINTSoff_SUB|MODE_SUB; | |
154 final int delScore=scoreFromDel+POINTSoff_DEL|MODE_DEL; | |
155 final int insScore=scoreFromIns+POINTSoff_INS|MODE_INS; | |
156 | |
157 int diagScore=(match ? diagScoreM : diagScoreS); | |
158 diagScore=scoreFromDiag+(defined ? diagScore : POINTSoff_NOREF_MODE_SUB); | |
159 | |
160 int score=diagScore>=delScore ? diagScore : delScore; | |
161 score=score>=insScore ? score : insScore; | |
162 | |
163 packed[row][col]=score; | |
164 } | |
165 // iterationsUnlimited+=columns; | |
166 } | |
167 | |
168 | |
169 int maxCol=-1; | |
170 int maxState=-1; | |
171 int maxStart=-1; | |
172 int maxScore=Integer.MIN_VALUE; | |
173 | |
174 for(int col=1; col<=columns; col++){ | |
175 int x=packed[rows][col]; | |
176 if((x&SCOREMASK)>maxScore){ | |
177 maxScore=x; | |
178 maxCol=col; | |
179 maxState=x&MODEMASK; | |
180 maxStart=x&STARTMASK; | |
181 } | |
182 } | |
183 maxScore>>=SCOREOFFSET; | |
184 | |
185 // System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore+"; minScore="+minScore); | |
186 return maxScore<minScore ? null : new int[] {rows, maxCol, maxState, maxScore, maxStart}; | |
187 } | |
188 | |
189 /** Generates the match string */ | |
190 @Override | |
191 public final byte[] traceback(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ | |
192 // assert(false); | |
193 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; | |
194 assert(row==rows); | |
195 | |
196 byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". | |
197 int outPos=0; | |
198 | |
199 // assert(state==(packed[row][col]&MODEMASK)); | |
200 | |
201 while(row>0 && col>0){ | |
202 byte q=query[row-1]; | |
203 byte r=ref[refStartLoc+col-1]; | |
204 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r)); | |
205 state=(packed[row][col]&MODEMASK); | |
206 if(state==MODE_MATCH){ | |
207 col--; | |
208 row--; | |
209 out[outPos]=defined ? (byte)'m' : (byte)'N'; | |
210 }else if(state==MODE_SUB){ | |
211 col--; | |
212 row--; | |
213 out[outPos]=defined ? (byte)'S' : (byte)'N'; | |
214 }else if(state==MODE_N){ | |
215 col--; | |
216 row--; | |
217 out[outPos]='N'; | |
218 }else if(state==MODE_DEL){ | |
219 col--; | |
220 out[outPos]='D'; | |
221 }else if(state==MODE_INS){ | |
222 row--; | |
223 // out[outPos]='I'; | |
224 if(col>=0 && col<columns){ | |
225 out[outPos]='I'; | |
226 }else{ | |
227 out[outPos]='C'; | |
228 col--; | |
229 } | |
230 }else{ | |
231 assert(false) : state; | |
232 } | |
233 outPos++; | |
234 } | |
235 | |
236 assert(row==0 || col==0); | |
237 if(col!=row){//Not sure what this is doing | |
238 while(row>0){ | |
239 out[outPos]='C'; | |
240 outPos++; | |
241 row--; | |
242 col--; | |
243 } | |
244 if(col>0){ | |
245 //do nothing | |
246 } | |
247 } | |
248 | |
249 //Shrink and reverse the string | |
250 byte[] out2=new byte[outPos]; | |
251 for(int i=0; i<outPos; i++){ | |
252 out2[i]=out[outPos-i-1]; | |
253 } | |
254 out=null; | |
255 | |
256 return out2; | |
257 } | |
258 | |
259 @Override | |
260 /** Generates identity; | |
261 * fills 'extra' with {match, sub, del, ins, N, clip} if present */ | |
262 public float tracebackIdentity(byte[] query, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, int[] extra){ | |
263 | |
264 // assert(false); | |
265 assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; | |
266 assert(row==rows); | |
267 | |
268 // assert(state==(packed[row][col]&MODEMASK)); | |
269 int match=0, sub=0, del=0, ins=0, noref=0, clip=0; | |
270 | |
271 while(row>0 && col>0){ | |
272 byte q=query[row-1]; | |
273 byte r=ref[refStartLoc+col-1]; | |
274 boolean defined=(AminoAcid.isFullyDefined(q) && AminoAcid.isFullyDefined(r)); | |
275 state=(packed[row][col]&MODEMASK); | |
276 if(state==MODE_MATCH){ | |
277 col--; | |
278 row--; | |
279 match+=(defined ? 1 : 0); | |
280 noref+=(defined ? 0 : 1); | |
281 }else if(state==MODE_SUB){ | |
282 col--; | |
283 row--; | |
284 sub+=(defined ? 1 : 0); | |
285 noref+=(defined ? 0 : 1); | |
286 }else if(state==MODE_N){//Can't currently happen | |
287 col--; | |
288 row--; | |
289 noref++; | |
290 }else if(state==MODE_DEL){ | |
291 col--; | |
292 del++; | |
293 }else if(state==MODE_INS){ | |
294 row--; | |
295 boolean edge=(col<=1 || col>=columns); | |
296 ins+=(edge ? 0 : 1); | |
297 clip+=(edge ? 1 : 0); | |
298 }else{ | |
299 assert(false) : state; | |
300 } | |
301 } | |
302 | |
303 assert(row==0 || col==0); | |
304 if(col!=row){//Not sure what this is doing | |
305 while(row>0){ | |
306 clip++; | |
307 row--; | |
308 col--; | |
309 } | |
310 if(col>0){ | |
311 //do nothing | |
312 } | |
313 } | |
314 | |
315 if(extra!=null){ | |
316 assert(extra.length==5); | |
317 extra[0]=match; | |
318 extra[1]=sub; | |
319 extra[2]=del; | |
320 extra[3]=ins; | |
321 extra[4]=noref; | |
322 extra[5]=clip; | |
323 } | |
324 | |
325 float len=match+sub+ins+del+noref*0.1f; | |
326 float id=match/Tools.max(1.0f, len); | |
327 return id; | |
328 } | |
329 | |
330 /** @return {score, bestRefStart, bestRefStop} */ | |
331 @Override | |
332 public final int[] score(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, | |
333 final int maxRow, final int maxCol, final int maxState/*, final int maxScore, final int maxStart*/){ | |
334 | |
335 int row=maxRow; | |
336 int col=maxCol; | |
337 int state=maxState; | |
338 | |
339 assert(maxState>=0 && maxState<packed.length) : | |
340 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); | |
341 assert(maxRow>=0 && maxRow<packed.length) : | |
342 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); | |
343 assert(maxCol>=0 && maxCol<packed[0].length) : | |
344 maxState+", "+maxRow+", "+maxCol+"\n"+new String(read)+"\n"+toString(ref, refStartLoc, refEndLoc); | |
345 | |
346 int score=packed[maxRow][maxCol]&SCOREMASK; //Or zero, if it is to be recalculated | |
347 | |
348 if(row<rows){ | |
349 int difR=rows-row; | |
350 int difC=columns-col; | |
351 | |
352 while(difR>difC){ | |
353 score+=POINTSoff_NOREF; | |
354 difR--; | |
355 } | |
356 | |
357 row+=difR; | |
358 col+=difR; | |
359 | |
360 } | |
361 | |
362 assert(refStartLoc<=refEndLoc); | |
363 assert(row==rows); | |
364 | |
365 | |
366 final int bestRefStop=refStartLoc+col-1; | |
367 | |
368 while(row>0 && col>0){ | |
369 // final byte c=read[row-1]; | |
370 // final byte r=ref[refStartLoc+col-1]; | |
371 // final boolean defined=(AminoAcid.isFullyDefined(c) && AminoAcid.isFullyDefined(r)); | |
372 state=(packed[row][col]&MODEMASK); | |
373 if(state==MODE_MATCH){ | |
374 col--; | |
375 row--; | |
376 }else if(state==MODE_SUB){ | |
377 col--; | |
378 row--; | |
379 }else if(state==MODE_N){ | |
380 col--; | |
381 row--; | |
382 }else if(state==MODE_DEL){ | |
383 col--; | |
384 }else if(state==MODE_INS){ | |
385 row--; | |
386 }else{ | |
387 assert(false) : state; | |
388 } | |
389 } | |
390 // assert(false) : row+", "+col; | |
391 if(row>col){ | |
392 col-=row; | |
393 } | |
394 | |
395 final int bestRefStart=refStartLoc+col; | |
396 | |
397 score>>=SCOREOFFSET; | |
398 // System.err.println("t2\t"+score+", "+maxScore+", "+maxStart+", "+bestRefStart); | |
399 int[] rvec; | |
400 if(bestRefStart<refStartLoc || bestRefStop>refEndLoc){ //Suggest extra padding in cases of overflow | |
401 int padLeft=Tools.max(0, refStartLoc-bestRefStart); | |
402 int padRight=Tools.max(0, bestRefStop-refEndLoc); | |
403 rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; | |
404 }else{ | |
405 rvec=new int[] {score, bestRefStart, bestRefStop}; | |
406 } | |
407 return rvec; | |
408 } | |
409 | |
410 | |
411 /** Will not fill areas that cannot match minScore. | |
412 * @return {score, bestRefStart, bestRefStop} */ | |
413 @Override | |
414 public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ | |
415 int a=Tools.max(0, refStartLoc); | |
416 int b=Tools.min(ref.length-1, refEndLoc); | |
417 assert(b>=a); | |
418 | |
419 if(b-a>=maxColumns){ | |
420 System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); | |
421 assert(false) : refStartLoc+", "+refEndLoc; | |
422 b=Tools.min(ref.length-1, a+maxColumns-1); | |
423 } | |
424 int[] max=fillLimited(read, ref, a, b, minScore); | |
425 // return max==null ? null : new int[] {max[3], 0, max[1]}; | |
426 | |
427 int[] score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2]/*, max[3], max[4]*/)); | |
428 | |
429 return score; | |
430 } | |
431 | |
432 public static final String toString(byte[] ref, int startLoc, int stopLoc){ | |
433 StringBuilder sb=new StringBuilder(stopLoc-startLoc+1); | |
434 for(int i=startLoc; i<=stopLoc; i++){sb.append((char)ref[i]);} | |
435 return sb.toString(); | |
436 } | |
437 | |
438 // public int maxScoreByIdentity(int len, float identity){ | |
439 // assert(identity>=0 && identity<=1); | |
440 // return (int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB)); | |
441 // } | |
442 | |
443 @Override | |
444 public int minScoreByIdentity(int len, float identity){ | |
445 assert(identity>=0 && identity<=1); | |
446 | |
447 int a=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_SUB)); | |
448 int b=(int)(len*(identity*POINTS_MATCH+(1-identity)*POINTS_INS)); | |
449 int c=(int)(len*(1*POINTS_MATCH+((1/(Tools.max(identity, 0.000001f)))-1)*POINTS_DEL)); | |
450 return Tools.min(a, b, c); | |
451 } | |
452 | |
453 private static int calcDelScoreOffset(int len){ | |
454 if(len<=0){return 0;} | |
455 int score=POINTSoff_DEL*len; | |
456 return score; | |
457 } | |
458 // | |
459 // public static int calcInsScore(int len){ | |
460 // if(len<=0){return 0;} | |
461 // int score=POINTS_INS; | |
462 // | |
463 // if(len>1){ | |
464 // score+=(len-1)*POINTS_INS2; | |
465 // } | |
466 // return score; | |
467 // } | |
468 // | |
469 // private static int calcInsScoreOffset(int len){ | |
470 // if(len<=0){return 0;} | |
471 // int score=POINTSoff_INS; | |
472 // | |
473 // if(len>1){ | |
474 // score+=(len-1)*POINTSoff_INS2; | |
475 // } | |
476 // return score; | |
477 // } | |
478 | |
479 @Override | |
480 public int rows(){return rows;} | |
481 @Override | |
482 public int columns(){return columns;} | |
483 | |
484 | |
485 private int maxRows; | |
486 private int maxColumns; | |
487 | |
488 private int[][] packed; | |
489 | |
490 public static final int MODEBITS=3; | |
491 public static final int STARTBITS=9; | |
492 public static final int LOWBITS=MODEBITS+STARTBITS; | |
493 public static final int SCOREBITS=32-STARTBITS; | |
494 public static final int MAX_START=((1<<STARTBITS)-1); | |
495 public static final int MAX_SCORE=((1<<(SCOREBITS-1))-1)-2000; | |
496 public static final int MIN_SCORE=0-MAX_SCORE; //Keeps it 1 point above "BAD". | |
497 | |
498 public static final int STARTOFFSET=MODEBITS; | |
499 public static final int SCOREOFFSET=LOWBITS; | |
500 | |
501 public static final int MODEMASK=~((-1)<<MODEBITS); | |
502 public static final int STARTMASK=(~((-1)<<STARTBITS))<<STARTOFFSET; | |
503 public static final int SCOREMASK=(~((-1)<<SCOREBITS))<<SCOREOFFSET; | |
504 public static final int HIGHMASK=SCOREMASK|STARTMASK; | |
505 | |
506 //For some reason changing MODE_DEL from 1 to 0 breaks everything | |
507 private static final byte MODE_DEL=1; | |
508 private static final byte MODE_INS=2; | |
509 private static final byte MODE_SUB=3; | |
510 private static final byte MODE_MATCH=4; | |
511 private static final byte MODE_N=5; | |
512 | |
513 public static final int POINTS_NOREF=-15; | |
514 public static final int POINTS_MATCH=100; | |
515 public static final int POINTS_SUB=-50; | |
516 public static final int POINTS_INS=-121; | |
517 public static final int POINTS_DEL=-111; | |
518 | |
519 public static final int BAD=MIN_SCORE-1; | |
520 | |
521 | |
522 public static final int POINTSoff_NOREF=(POINTS_NOREF<<SCOREOFFSET); | |
523 public static final int POINTSoff_NOREF_MODE_SUB=POINTSoff_NOREF|MODE_SUB; | |
524 public static final int POINTSoff_MATCH=(POINTS_MATCH<<SCOREOFFSET); | |
525 public static final int POINTSoff_SUB=(POINTS_SUB<<SCOREOFFSET); | |
526 public static final int POINTSoff_INS=(POINTS_INS<<SCOREOFFSET); | |
527 public static final int POINTSoff_DEL=(POINTS_DEL<<SCOREOFFSET); | |
528 public static final int BADoff=(BAD<<SCOREOFFSET); | |
529 | |
530 private int rows; | |
531 private int columns; | |
532 | |
533 // public long iterationsLimited=0; | |
534 // public long iterationsUnlimited=0; | |
535 | |
536 public boolean verbose=false; | |
537 public boolean verbose2=false; | |
538 | |
539 } |