Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/SketchObject.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package sketch; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.Arrays; | |
6 import java.util.Collections; | |
7 import java.util.Random; | |
8 | |
9 import aligner.SingleStateAlignerFlat2; | |
10 import aligner.SingleStateAlignerFlatFloat; | |
11 import dna.AminoAcid; | |
12 import dna.Data; | |
13 import prok.GeneCaller; | |
14 import prok.GeneModel; | |
15 import prok.GeneModelParser; | |
16 import prok.ProkObject; | |
17 import shared.Parse; | |
18 import shared.Shared; | |
19 import shared.Timer; | |
20 import shared.Tools; | |
21 import stream.Read; | |
22 import structures.EntropyTracker; | |
23 import structures.IntList3; | |
24 import tax.TaxTree; | |
25 | |
26 public class SketchObject { | |
27 | |
28 /*--------------------------------------------------------------*/ | |
29 /*---------------- Parsing ----------------*/ | |
30 /*--------------------------------------------------------------*/ | |
31 | |
32 public static void main(String[] args){ | |
33 Timer t=new Timer(); | |
34 for(int i=0; i<1000000; i++){ | |
35 wkidToAniExact(0.5, 32, 23, 0.000005); | |
36 } | |
37 t.stop(); | |
38 System.out.println(t); | |
39 } | |
40 | |
41 public static boolean parseSketchFlags(String arg, String a, String b){ | |
42 | |
43 if(parseCoding(arg, a, b)){ | |
44 // | |
45 } | |
46 | |
47 else if(a.equalsIgnoreCase("k")){ | |
48 if(b.indexOf(',')>=0){ | |
49 String[] split=b.split(","); | |
50 assert(split.length==2) : "Bad argument "+arg; | |
51 int x=Integer.parseInt(split[0]); | |
52 int y=Integer.parseInt(split[1]); | |
53 k=Tools.max(x, y); | |
54 k2=Tools.min(x, y); | |
55 if(k==k2){k2=0;} | |
56 }else{ | |
57 k=Integer.parseInt(b); | |
58 k2=0; | |
59 } | |
60 setK=true; | |
61 }else if(a.equalsIgnoreCase("rcomp")){ | |
62 rcomp=Parse.parseBoolean(b); | |
63 }else if(a.equalsIgnoreCase("minfakeid")){ | |
64 minFakeID=Integer.parseInt(b); | |
65 }else if(a.equalsIgnoreCase("hashNames")){ | |
66 hashNames=Parse.parseBoolean(b); | |
67 }else if(a.equalsIgnoreCase("preferSSUMap")){ | |
68 preferSSUMap=Parse.parseBoolean(b); | |
69 }else if(a.equalsIgnoreCase("preferSSUMapForEuks") || a.equalsIgnoreCase("preferSSUMapEuks")){ | |
70 preferSSUMapForEuks=Parse.parseBoolean(b); | |
71 }else if(a.equalsIgnoreCase("useSSUMapOnly")){ | |
72 useSSUMapOnly=Parse.parseBoolean(b); | |
73 }else if(a.equalsIgnoreCase("useSSUMapOnlyForEuks") || a.equalsIgnoreCase("SSUMapOnlyEuks")){ | |
74 useSSUMapOnlyForEuks=Parse.parseBoolean(b); | |
75 }else if(a.equalsIgnoreCase("skipNonCanonical")){ | |
76 skipNonCanonical=Parse.parseBoolean(b); | |
77 }else if(a.equalsIgnoreCase("useSizeEstimateInScore") || a.equalsIgnoreCase("useSizeEstimate")){ | |
78 useSizeEstimate=Parse.parseBoolean(b); | |
79 }else if(a.equalsIgnoreCase("blacklist") || a.equalsIgnoreCase("bl")){ | |
80 blacklist=b; | |
81 }else if(a.equalsIgnoreCase("whitelist") || a.equalsIgnoreCase("usewhitelist")){ | |
82 useWhitelist=Parse.parseBoolean(b); | |
83 } | |
84 | |
85 else if(a.equalsIgnoreCase("sampleseed")){ | |
86 sampleseed=Long.parseLong(b); | |
87 } | |
88 | |
89 else if(a.equalsIgnoreCase("size") || a.equalsIgnoreCase("sketchsize")){ | |
90 if("auto".equalsIgnoreCase(b)){ | |
91 AUTOSIZE=true; | |
92 AUTOSIZE_LINEAR=false; | |
93 SET_AUTOSIZE=true; | |
94 }else if("linear".equalsIgnoreCase(b)){ | |
95 AUTOSIZE=false; | |
96 AUTOSIZE_LINEAR=true; | |
97 SET_AUTOSIZE=true; | |
98 }else{ | |
99 AUTOSIZE=false; | |
100 AUTOSIZE_LINEAR=false; | |
101 targetSketchSize=Parse.parseIntKMG(b); | |
102 SET_TARGET_SIZE=true; | |
103 } | |
104 }else if(a.equalsIgnoreCase("maxfraction") || a.equalsIgnoreCase("maxgenomefraction") || a.equalsIgnoreCase("mgf")){ | |
105 maxGenomeFraction=Float.parseFloat(b); | |
106 }else if(a.equalsIgnoreCase("smallsketchsize")){ | |
107 smallSketchSize=Integer.parseInt(b); | |
108 }else if(a.equalsIgnoreCase("minSketchSize")){ | |
109 minSketchSize=Tools.max(1, Integer.parseInt(b)); | |
110 }else if(a.equalsIgnoreCase("autosize")){ | |
111 AUTOSIZE=Parse.parseBoolean(b); | |
112 if(AUTOSIZE){AUTOSIZE_LINEAR=false;} | |
113 SET_AUTOSIZE=true; | |
114 }else if(a.equalsIgnoreCase("autosizefactor") || a.equalsIgnoreCase("autosizefraction") || a.equalsIgnoreCase("autosizemult") || a.equalsIgnoreCase("sizemult")){ | |
115 AUTOSIZE_FACTOR=Float.parseFloat(b); | |
116 SET_AUTOSIZE_FACTOR=true; | |
117 AUTOSIZE_LINEAR=false; | |
118 SET_AUTOSIZE=true; | |
119 }else if(a.equalsIgnoreCase("linear") || a.equalsIgnoreCase("lineardensity") || a.equalsIgnoreCase("density")){ | |
120 if(Tools.isNumeric(b)){ | |
121 AUTOSIZE_LINEAR_DENSITY=Double.parseDouble(b); | |
122 AUTOSIZE_LINEAR=true; | |
123 AUTOSIZE=false; | |
124 SET_AUTOSIZE=true; | |
125 }else{ | |
126 AUTOSIZE_LINEAR=Parse.parseBoolean(b); | |
127 if(AUTOSIZE_LINEAR){AUTOSIZE=false;} | |
128 SET_AUTOSIZE=true; | |
129 } | |
130 }else if(a.equalsIgnoreCase("maxGenomeFractionSmall")){ | |
131 maxGenomeFractionSmall=Float.parseFloat(b); | |
132 }else if(a.equalsIgnoreCase("keyFraction")){ | |
133 double d=Double.parseDouble(b); | |
134 setKeyFraction(d); | |
135 }else if(a.equalsIgnoreCase("prealloc")){ | |
136 if(b==null || Character.isLetter(b.charAt(0))){ | |
137 if(Parse.parseBoolean(b)){ | |
138 prealloc=1.0f; | |
139 }else{ | |
140 prealloc=0; | |
141 } | |
142 }else{ | |
143 prealloc=Float.parseFloat(b); | |
144 } | |
145 } | |
146 | |
147 else if(a.equalsIgnoreCase("intmap")){ | |
148 SketchIndex.useIntMap=Parse.parseBoolean(b); | |
149 }else if(a.equalsIgnoreCase("intmapsize")){ | |
150 SketchIndex.intMapSize=Parse.parseIntKMG(b); | |
151 }else if(a.equalsIgnoreCase("bitsetbits")){ | |
152 assert(false) : "bitsetbits should be 2."; | |
153 // bitSetBits=Integer.parseInt(b); | |
154 } | |
155 | |
156 // else if(a.equalsIgnoreCase("minkmercount") || a.equalsIgnoreCase("minkeycount")){ | |
157 // minKeyOccuranceCount=Integer.parseInt(b); | |
158 // } | |
159 else if(a.equalsIgnoreCase("sketchHeapFactor")){ | |
160 sketchHeapFactor=Tools.max(1.0, Double.parseDouble(b)); | |
161 } | |
162 | |
163 else if(a.equalsIgnoreCase("killok")){ | |
164 KILL_OK=Parse.parseBoolean(b); | |
165 }else if(a.equalsIgnoreCase("ssa") || a.equalsIgnoreCase("usessa")){ | |
166 useSSA=Parse.parseBoolean(b); | |
167 }else if(a.equalsIgnoreCase("ssa3") || a.equalsIgnoreCase("usessa3")){ | |
168 useSSA3=Parse.parseBoolean(b); | |
169 }else if(a.equalsIgnoreCase("exactani")){ | |
170 EXACT_ANI=Parse.parseBoolean(b); | |
171 }else if(a.equalsIgnoreCase("translate") || a.equalsIgnoreCase("callgenes")){ | |
172 translate=Parse.parseBoolean(b); | |
173 defaultParams.translate=translate; | |
174 }else if(a.equalsIgnoreCase("sixframes") || a.equalsIgnoreCase("6frames")){ | |
175 sixframes=Parse.parseBoolean(b); | |
176 defaultParams.sixframes=sixframes; | |
177 if(sixframes){ | |
178 translate=defaultParams.translate=true; | |
179 } | |
180 }else if(a.equalsIgnoreCase("processSSU") || a.equalsIgnoreCase("process16S") || a.equalsIgnoreCase("16S") || a.equalsIgnoreCase("SSU")){ | |
181 processSSU=Parse.parseBoolean(b); | |
182 }else if(a.equalsIgnoreCase("hashSeed")){ | |
183 hashSeed=Long.parseLong(b); | |
184 //remakeCodes(hashSeed); //Happens in postParse | |
185 } | |
186 | |
187 else if(a.equalsIgnoreCase("forceDisableMultithreadedFastq")){ | |
188 forceDisableMultithreadedFastq=Parse.parseBoolean(b); | |
189 } | |
190 | |
191 else if(a.equalsIgnoreCase("verifyentropy")){ | |
192 EntropyTracker.verify=Parse.parseBoolean(b); | |
193 }else if(a.equalsIgnoreCase("entropyK")){ | |
194 entropyK=Integer.parseInt(b); | |
195 }else if(a.equalsIgnoreCase("fastentropy") || a.equalsIgnoreCase("fentropy")){ | |
196 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.FAST;} | |
197 }else if(a.equalsIgnoreCase("mediumentropy") || a.equalsIgnoreCase("mentropy")){ | |
198 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.MEDIUM;} | |
199 }else if(a.equalsIgnoreCase("slowentropy") || a.equalsIgnoreCase("sentropy")){ | |
200 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.SLOW;} | |
201 }else if(a.equalsIgnoreCase("superslowentropy") || a.equalsIgnoreCase("ssentropy")){ | |
202 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.SUPERSLOW;} | |
203 }else if(a.equalsIgnoreCase("verbose2")){ | |
204 verbose2=Parse.parseBoolean(b); | |
205 }else if(a.equalsIgnoreCase("loadSketchesFromSketchFile2")){ | |
206 LOADER2=Parse.parseBoolean(b); | |
207 } | |
208 | |
209 else if(a.equalsIgnoreCase("useToValue2") || a.equalsIgnoreCase("ToValue2")){ | |
210 useToValue2=Parse.parseBoolean(b); | |
211 }else if(a.equals("ssu") || a.equals("ssufile")){ | |
212 assert(false) : "ssufile is deprecated; please specify 16Sfile and/or 18Sfile independently"; | |
213 }else if(a.equalsIgnoreCase("16Sfile")/* || a.equalsIgnoreCase("ssufile") || a.equalsIgnoreCase("silvafile")*/){ | |
214 SSUMap.r16SFile=b; | |
215 if("auto".equalsIgnoreCase(SSUMap.r16SFile)){SSUMap.r16SFile=TaxTree.default16SFile();} | |
216 assert(SSUMap.r16SMap==null); | |
217 }else if(a.equalsIgnoreCase("18Sfile")){ | |
218 SSUMap.r18SFile=b; | |
219 if("auto".equalsIgnoreCase(SSUMap.r18SFile)){SSUMap.r18SFile=TaxTree.default18SFile();} | |
220 assert(SSUMap.r18SMap==null); | |
221 } | |
222 | |
223 // else if(a.equalsIgnoreCase("minHashValue")){ | |
224 // minHashValue=Tools.max(1, Long.parseLong(b)); | |
225 // } | |
226 | |
227 else{ | |
228 return false; | |
229 } | |
230 return true; | |
231 } | |
232 | |
233 private static boolean parseCoding(String arg, String a, String b){ | |
234 if(a.equalsIgnoreCase("deltaout") || a.equalsIgnoreCase("delta")){ | |
235 deltaOut=Parse.parseBoolean(b); | |
236 }else if(a.equalsIgnoreCase("a33") || a.equalsIgnoreCase("a48")){ | |
237 boolean x=Parse.parseBoolean(b); | |
238 if(x){CODING=A48;} | |
239 else if(CODING==A48){CODING=HEX;} | |
240 }else if(a.equalsIgnoreCase("hex")){ | |
241 boolean x=Parse.parseBoolean(b); | |
242 if(x){CODING=HEX;} | |
243 else if(CODING==HEX){CODING=A48;} | |
244 }else if(a.equalsIgnoreCase("raw")){ | |
245 boolean x=Parse.parseBoolean(b); | |
246 if(x){CODING=RAW;} | |
247 else if(CODING==RAW){CODING=A48;} | |
248 }else{ | |
249 return false; | |
250 } | |
251 return true; | |
252 } | |
253 | |
254 static int parseMode(String[] args){ | |
255 int mode=defaultParams.mode; | |
256 for(String arg : args){ | |
257 String[] split=arg.split("="); | |
258 String a=split[0].toLowerCase(); | |
259 String b=split.length>1 ? split[1] : null; | |
260 int x=parseMode(arg, a, b); | |
261 if(x>-1){mode=x;} | |
262 } | |
263 return mode; | |
264 } | |
265 | |
266 static int parseMode(String arg, String a, String b){ | |
267 int mode_=-1; | |
268 if(a.equalsIgnoreCase("mode")){ | |
269 assert(b!=null) : "Bad parameter: "+arg; | |
270 if(Tools.isDigit(b.charAt(0))){ | |
271 mode_=Integer.parseInt(b); | |
272 }else if(b.equalsIgnoreCase("single") || b.equalsIgnoreCase("onesketch")){ | |
273 mode_=ONE_SKETCH; | |
274 }else if(b.equalsIgnoreCase("taxa") || b.equalsIgnoreCase("pertaxa")){ | |
275 mode_=PER_TAXA; | |
276 }else if(b.equalsIgnoreCase("sequence") || b.equalsIgnoreCase("persequence") || b.equalsIgnoreCase("header") || b.equalsIgnoreCase("perheader")){ | |
277 mode_=PER_SEQUENCE; | |
278 // }else if(b.equalsIgnoreCase("header") || b.equalsIgnoreCase("perheader")){ | |
279 // mode_=PER_HEADER; | |
280 }else if(b.equalsIgnoreCase("img")){ | |
281 mode_=PER_IMG; | |
282 }else if(b.equalsIgnoreCase("perfile") || b.equalsIgnoreCase("file")){ | |
283 mode_=PER_FILE; | |
284 }else{ | |
285 assert(false) : "Bad parameter: "+arg; | |
286 } | |
287 }else if(a.equalsIgnoreCase("single") || a.equalsIgnoreCase("onesketch")){ | |
288 mode_=ONE_SKETCH; | |
289 }else if(a.equalsIgnoreCase("taxa") || a.equalsIgnoreCase("pertaxa")){ | |
290 mode_=PER_TAXA; | |
291 }else if(a.equalsIgnoreCase("sequence") || a.equalsIgnoreCase("persequence") || a.equalsIgnoreCase("header") || a.equalsIgnoreCase("perheader")){ | |
292 mode_=PER_SEQUENCE; | |
293 // }else if(a.equalsIgnoreCase("header") || a.equalsIgnoreCase("perheader")){ | |
294 // mode_=PER_HEADER; | |
295 }else if(a.equalsIgnoreCase("perfile") || a.equalsIgnoreCase("file")){ | |
296 mode_=PER_FILE; | |
297 } | |
298 return mode_; | |
299 } | |
300 | |
301 /*--------------------------------------------------------------*/ | |
302 /*---------------- Initialization ----------------*/ | |
303 /*--------------------------------------------------------------*/ | |
304 | |
305 static synchronized void setTaxtree(String taxTreeFile, PrintStream outstream){ | |
306 if(taxTreeFile==null){ | |
307 taxtree=null; | |
308 return; | |
309 } | |
310 if(treefile!=null){ | |
311 assert(!treefile.equals(taxTreeFile)); | |
312 if(treefile.equals(taxTreeFile)){return;} | |
313 treefile=taxTreeFile; | |
314 } | |
315 taxtree=TaxTree.loadTaxTree(taxTreeFile, outstream, hashNames, false); | |
316 } | |
317 | |
318 public static void reset(){ | |
319 postparsed=false; | |
320 blacklist=null; | |
321 useWhitelist=false; | |
322 defaultParams=new DisplayParams(); | |
323 amino=false; | |
324 translate=false; | |
325 sixframes=false; | |
326 } | |
327 | |
328 public static void postParse(){ | |
329 if(postparsed){return;} | |
330 postparsed=true; | |
331 IntList3.defaultMode=IntList3.UNIQUE; //Not really safe, if Seal uses Sketch... | |
332 | |
333 if(defaultParams.amino()){amino=true;} | |
334 if(amino){Shared.AMINO_IN=true;} | |
335 | |
336 if(amino8){ | |
337 AminoAcid.AMINO_SHIFT=3; | |
338 System.err.println("Set AMINO_SHIFT to "+AminoAcid.AMINO_SHIFT); | |
339 } | |
340 | |
341 if(amino || translate){ | |
342 rcomp=false; | |
343 if(k>12){ | |
344 int oldk=k; | |
345 int oldk2=k2; | |
346 // k=Tools.min(k, 63/AminoAcid.AMINO_SHIFT); | |
347 // k2=Tools.min(k2, k); | |
348 k=defaultKAmino; | |
349 k2=defaultK2Amino; | |
350 if(k==k2){k2=0;} | |
351 if(k!=oldk || k2!=oldk2){System.err.println("Set k to "+k+(k2>0 ? ","+k2 : ""));} | |
352 } | |
353 // AUTOSIZE_FACTOR=(AUTOSIZE_FACTOR*defaultK)/k; | |
354 // System.err.println("Set AUTOSIZE_FACTOR to "+AUTOSIZE_FACTOR); | |
355 } | |
356 | |
357 if(aminoOrTranslate()){ | |
358 bitsPerBase=5;//Maybe it is safe to keep these at 4 and 8 or 5 and 8; need to check. | |
359 bitsPerCycle=10; | |
360 }else{ | |
361 bitsPerBase=2; | |
362 bitsPerCycle=8; | |
363 } | |
364 basesPerCycle=bitsPerCycle/bitsPerBase; | |
365 hashCycles=((k*bitsPerBase)+bitsPerCycle-1)/bitsPerCycle; | |
366 | |
367 cycleMask=~((-1L)<<bitsPerCycle); | |
368 maxCycles=(64+bitsPerCycle-1)/bitsPerCycle; | |
369 codeIncrement=(int)(cycleMask+1); | |
370 codes=makeCodes(maxCycles, codeIncrement, hashSeed, false); | |
371 codes1D=makeCodes1D(codes); | |
372 | |
373 if(k2>0){ | |
374 assert(k2<k) : "k2 must be less than k."; | |
375 // assert(k2%basesPerCycle==0) : "k2 must be a multiple of "+basesPerCycle; //No longer required! Still recommended for speed though. | |
376 | |
377 k2mask=~((-1L)<<(bitsPerBase*k2)); | |
378 k2submask=~((-1L)<<(bitsPerBase*(k2%basesPerCycle))); | |
379 k2shift=(k-k2); //for toValue2 | |
380 k2midmask=(k2mask<<((k2shift*bitsPerBase)/2)); //for toValue2 | |
381 hashCycles2=k2/basesPerCycle; | |
382 }else{ | |
383 useToValue2=false; | |
384 } | |
385 codeMax=hashCycles*codeIncrement; | |
386 codeMax2=hashCycles2*codeIncrement; | |
387 | |
388 // hasher=k2<1 ? new Hasher1() : k2submask==0 ? new Hasher2() : new Hasher3(); | |
389 | |
390 if(translate || processSSU){ | |
391 if(pgmFile==null){ | |
392 pgmFile=Data.findPath("?model.pgm"); | |
393 } | |
394 pgm=GeneModelParser.loadModel(pgmFile); | |
395 GeneCaller.call16S=processSSU; | |
396 GeneCaller.call18S=false;//processSSU; | |
397 GeneCaller.call23S=false; | |
398 GeneCaller.call5S=false; | |
399 GeneCaller.calltRNA=false; | |
400 GeneCaller.callCDS=translate; | |
401 | |
402 if(GeneCaller.call16S || GeneCaller.call18S || GeneCaller.call23S){ | |
403 ProkObject.loadLongKmers(); | |
404 ProkObject.loadConsensusSequenceFromFile(true, true); | |
405 } | |
406 } | |
407 | |
408 // if(HASH_VERSION<2 && useToValue2){HASH_VERSION=2;} | |
409 // else if(HASH_VERSION==2 && !useToValue2){HASH_VERSION=1;} | |
410 // assert(blacklist!=null) : blacklist+", "+(blacklist==null); | |
411 if(blacklist!=null){Blacklist.addFiles(blacklist);} | |
412 | |
413 // System.err.println("amino="+amino+", k="+k+", k2="+k2+", bitsPerCycle="+bitsPerCycle+", bitsPerBase="+bitsPerBase+ | |
414 // ", basesPerCycle="+basesPerCycle+", hashCycles="+hashCycles+", k2mask="+k2mask+", k2submask="+k2submask+", hashCycles2="+hashCycles2+ | |
415 // ", codeMax="+codeMax+", codeMax2="+codeMax2); | |
416 // structures.IntList3.ascending=false;//123 for testing | |
417 | |
418 alignerPool=new AlignmentThreadPool(Shared.threads()); | |
419 } | |
420 | |
421 /*--------------------------------------------------------------*/ | |
422 /*---------------- Hashing ----------------*/ | |
423 /*--------------------------------------------------------------*/ | |
424 | |
425 private static void remakeCodes(long hashSeed){ | |
426 long[][] codes2=makeCodes(maxCycles, codeIncrement, hashSeed, false); | |
427 long[] codes1D2=makeCodes1D(codes2); | |
428 for(int i=0; i<codes2.length; i++){ | |
429 for(int j=0; j<codes2[i].length; j++){ | |
430 codes[i][j]=codes2[i][j]; | |
431 } | |
432 } | |
433 for(int i=0; i<codes1D2.length; i++){ | |
434 codes1D[i]=codes1D2[i]; | |
435 } | |
436 } | |
437 | |
438 public static long[][] makeCodes(int symbols, int modes, long seed, boolean positive){ | |
439 Random randy=new Random(seed); | |
440 long mask=positive ? Long.MAX_VALUE : -1L; | |
441 long[][] r=new long[symbols][modes]; | |
442 for(int i=0; i<symbols; i++){ | |
443 for(int j=0; j<modes; j++){ | |
444 r[i][j]=randy.nextLong()&mask; | |
445 } | |
446 } | |
447 for(int i=0; i<3; i++) { | |
448 antialias(r, randy); | |
449 } | |
450 return r; | |
451 } | |
452 | |
453 private static void antialias(long[][] matrix, Random randy){ | |
454 for(long[] array : matrix){ | |
455 antialias(array, randy); | |
456 } | |
457 } | |
458 | |
459 private static void antialias(long[] array, Random randy){ | |
460 for(int i=0; i<64; i++){ | |
461 antialiasNumbers(array, randy); | |
462 antialiasBit(array, randy, i); | |
463 } | |
464 } | |
465 | |
466 private static void antialiasBit(long[] array, Random randy, int bit){ | |
467 int half=array.length/2; | |
468 long ones=0; | |
469 for(int i=0; i<array.length; i++){ | |
470 ones+=(array[i]>>bit)&1L; | |
471 } | |
472 final long orMask=1L<<bit; | |
473 final long andMask=~orMask; | |
474 while(ones<half-1){ | |
475 int loc=randy.nextInt(array.length); | |
476 while((array[loc]&orMask)!=0){ | |
477 loc=randy.nextInt(array.length); | |
478 } | |
479 array[loc]|=orMask; | |
480 ones++; | |
481 } | |
482 while(ones>half+1){ | |
483 int loc=randy.nextInt(array.length); | |
484 while((array[loc]&orMask)==0){ | |
485 loc=randy.nextInt(array.length); | |
486 } | |
487 array[loc]&=andMask; | |
488 ones--; | |
489 } | |
490 } | |
491 | |
492 private static void antialiasNumbers(long[] array, Random randy){ | |
493 for(int i=0; i<array.length; i++){ | |
494 array[i]=antialiasNumber(array[i], randy); | |
495 } | |
496 } | |
497 | |
498 private static long antialiasNumber(long number, Random randy){ | |
499 int ones=Long.bitCount(number); | |
500 while(Long.bitCount(number)<31){ | |
501 number=number|(1L<<randy.nextInt(64)); | |
502 } | |
503 while(Long.bitCount(number)>33){ | |
504 number=number&~(1L<<randy.nextInt(64)); | |
505 } | |
506 return number; | |
507 } | |
508 | |
509 // public static long[] makeCodes1D(int symbols, int modes, long seed, boolean positive){ | |
510 // Random randy=Shared.threadLocalRandom(seed); | |
511 // long mask=positive ? Long.MAX_VALUE : -1L; | |
512 // long[] r=new long[symbols*modes]; | |
513 // for(int i=0; i<r.length; i++){ | |
514 // r[i]=randy.nextLong()&mask; | |
515 // } | |
516 // return r; | |
517 // } | |
518 | |
519 public static long[] makeCodes1D(long[][] codes2D){ | |
520 final int len=codes2D.length*codes2D[0].length; | |
521 long[] codes1D=new long[len]; | |
522 int k=0; | |
523 for(long[] array : codes2D){ | |
524 for(long x : array){ | |
525 codes1D[k]=x; | |
526 k++; | |
527 } | |
528 } | |
529 return codes1D; | |
530 } | |
531 | |
532 public static final long hash(long kmer){//Avoid using this. | |
533 return rcomp ? hash(kmer, AminoAcid.reverseComplementBinaryFast(kmer, k)) : hash(kmer, kmer); | |
534 } | |
535 | |
536 public static final long hash(long kmer, long rkmer){ | |
537 assert(postparsed); | |
538 // return hasher.hash_inner(kmer); //Slow | |
539 // return k2<1 ? hash1(kmer) : hash2(kmer); | |
540 if(useToValue2){return hashToValue2(kmer, rkmer);} | |
541 final long key=toValue(kmer, rkmer); | |
542 return k2<1 ? hash1(key) : k2submask==0 ? hash2(key) : hash3(key); | |
543 } | |
544 | |
545 /** | |
546 * New version. | |
547 * Generates a hash code from a kmer. | |
548 * @param kmer Kmer to hash | |
549 * @return Hash code | |
550 */ | |
551 private static final long hashToValue2(final long kmer0, final long rkmer0){ | |
552 // return Tools.max(Tools.hash64shift(kmer0), Tools.hash64shift(kmer0&0xFFFFFFFFFFFFL));//Something like this would be around 10% faster; not worth it. | |
553 long kmer=kmer0, rkmer=rkmer0; | |
554 final long key; | |
555 final boolean useK1; | |
556 if(rcomp){ | |
557 assert(!amino && !translate); | |
558 final long kmer2=((kmer&k2midmask)>>>k2shift); | |
559 final long rkmer2=((rkmer&k2midmask)>>>k2shift); | |
560 final long max2=Tools.max(kmer2, rkmer2); | |
561 // long cmasked=max2&kmerChoiceMask; | |
562 // useK1=kmerChoiceBitset.get((int)cmasked); | |
563 useK1=((max2%4999L)&1L)==0L; | |
564 key=useK1 ? Tools.max(kmer, rkmer) : max2; | |
565 // System.err.println("\n"+longer+", "+max2+", "+(max2%4999L)+", "+((max2%4999L)&1L)+ | |
566 // ", "+Long.toHexString(kmer)+", "+Long.toHexString(max2)+", "+Long.toHexString(value)); | |
567 // long value=longer ? (kmer2>rkmer2 ? kmer : rkmer) : max2; | |
568 // System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+ | |
569 // "\n"+AminoAcid.kmerToString(kmer2, k2)+"\n"+AminoAcid.kmerToString(rkmer2, k2)+"\n"+AminoAcid.kmerToString(value, k)+"\n"); | |
570 // assert(kmer2==AminoAcid.reverseComplementBinaryFast(rkmer2, k2));) | |
571 }else{ | |
572 assert(amino || translate); | |
573 final long kmer2=(kmer&k2mask); | |
574 useK1=((kmer2%4999)&1L)==0L; | |
575 key=useK1 ? kmer : kmer2; | |
576 } | |
577 // if(key%5!=0){return -1;}//This makes it a little faster. | |
578 final long bit=useK1 ? 0 : 1; //Note that this gets reversed later during the subtraction process | |
579 | |
580 // System.err.println(bit+", "+Long.toHexString(kmer)+", "+Long.toHexString(k2mask)); | |
581 // assert(bit==0) : bit+", "+Long.toHexString(kmer); | |
582 | |
583 long hashcode=key, data=key; | |
584 // for(int i=0; i<codeMax; i+=codeIncrement){ | |
585 // int x=(int)(data&cycleMask); | |
586 // data>>>=bitsPerCycle; | |
587 // hashcode^=codes1D[i+x]; | |
588 // } | |
589 // for(int i=0; data!=0; i+=codeIncrement){ | |
590 // int x=(int)(data&cycleMask); | |
591 // data>>>=bitsPerCycle; | |
592 // hashcode^=codes1D[i+x]; | |
593 // } | |
594 {//5% faster than other loops. | |
595 int i=0; | |
596 do{ | |
597 int x=(int)(data&cycleMask); | |
598 data>>>=bitsPerCycle; | |
599 hashcode^=codes1D[i+x]; | |
600 i+=codeIncrement; | |
601 }while(data!=0); | |
602 } | |
603 hashcode=(hashcode&(-2L))|bit; | |
604 | |
605 return hashcode; | |
606 } | |
607 | |
608 /** | |
609 * Fastest version! | |
610 * Generates a hash code from a kmer. | |
611 * @param kmer Kmer to hash | |
612 * @return Hash code | |
613 */ | |
614 private static final long hash1(long kmer){ | |
615 // if(kmer%5!=0){return -1;} | |
616 long code=kmer; | |
617 for(int i=0; i<codeMax; i+=codeIncrement){ | |
618 int x=(int)(kmer&cycleMask); | |
619 kmer>>=bitsPerCycle; | |
620 code^=codes1D[i+x]; | |
621 } | |
622 | |
623 return code; | |
624 } | |
625 | |
626 | |
627 /** | |
628 * Generates a hash code from a kmer, using dual kmer lengths. | |
629 * @param kmer Kmer to hash | |
630 * @return Hash code | |
631 */ | |
632 private static final long hash2(final long kmer0){ | |
633 // return Tools.hash64shift(kmer0); | |
634 long kmer=kmer0; | |
635 long code=0; | |
636 | |
637 for(int i=0; i<codeMax2; i+=codeIncrement){ | |
638 int x=(int)(kmer&cycleMask); | |
639 kmer>>=bitsPerCycle; | |
640 code^=codes1D[i+x]; | |
641 } | |
642 final long code2=code^(k2mask&kmer0); | |
643 for(int i=codeMax2; i<codeMax; i+=codeIncrement){ | |
644 int x=(int)(kmer&cycleMask); | |
645 kmer>>=bitsPerCycle; | |
646 code^=codes1D[i+x]; | |
647 } | |
648 return ((code&1)==1) ? code2 : code^kmer0; //Random; faster than max. | |
649 } | |
650 | |
651 | |
652 /** | |
653 * Generates a hash code from a kmer, using dual kmer lengths, allowing K2 to be a non-multiple-of-4. | |
654 * Uses one additional and, xor, and lookup. | |
655 * @param kmer Kmer to hash | |
656 * @return Hash code | |
657 */ | |
658 private static final long hash3(final long kmer0){ | |
659 // return Tools.hash64shift(kmer0); | |
660 | |
661 long kmer=kmer0; | |
662 long code=0; | |
663 assert(k2submask!=0); | |
664 | |
665 int i=0; | |
666 for(; i<codeMax2; i+=codeIncrement){ | |
667 int x=(int)(kmer&cycleMask); | |
668 kmer>>=bitsPerCycle; | |
669 code^=codes1D[i+x]; | |
670 } | |
671 final int residual=(int)(kmer&k2submask); | |
672 final long code2=code^(k2mask&kmer0)^codes1D[i+residual]; | |
673 for(; i<codeMax; i+=codeIncrement){ | |
674 int x=(int)(kmer&cycleMask); | |
675 kmer>>=bitsPerCycle; | |
676 code^=codes1D[i+x]; | |
677 } | |
678 return ((code&1)==1) ? code2 : code^kmer0; //Random; faster than max. | |
679 } | |
680 | |
681 | |
682 /*--------------------------------------------------------------*/ | |
683 /*---------------- Convenience Methods ----------------*/ | |
684 /*--------------------------------------------------------------*/ | |
685 | |
686 public static final float align(byte[] query, byte[] ref){ | |
687 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA(); | |
688 int a=0, b=ref.length-1; | |
689 int[] max=ssa.fillUnlimited(query, ref, a, b, -9999); | |
690 if(max==null){return 0;} | |
691 | |
692 final int rows=max[0]; | |
693 final int maxCol=max[1]; | |
694 final int maxState=max[2]; | |
695 final float id=ssa.tracebackIdentity(query, ref, a, b, rows, maxCol, maxState, null); | |
696 return id; | |
697 } | |
698 | |
699 public static final float alignAndMakeMatch(Read r, byte[] ref){ | |
700 byte[] query=r.bases; | |
701 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA(); | |
702 int a=0, b=ref.length-1; | |
703 int[] max=ssa.fillUnlimited(query, ref, a, b, -9999); | |
704 if(max==null){return 0;} | |
705 | |
706 final int rows=max[0]; | |
707 final int maxCol=max[1]; | |
708 final int maxState=max[2]; | |
709 byte[] match=ssa.traceback(query, ref, a, b, rows, maxCol, 0); | |
710 int[] score=ssa.score(query, ref, a, b, rows, maxCol, 0); | |
711 final float id=Read.identity(match); | |
712 r.match=match; | |
713 r.start=score[1]; | |
714 r.stop=score[2]; | |
715 r.setMapped(true); | |
716 return id; | |
717 } | |
718 | |
719 public static final float alignAndMakeMatch(Read r, byte[] ref, float[] refWeights/*, float[] insWeights, float[] delWeights*/){ | |
720 byte[] query=r.bases; | |
721 SingleStateAlignerFlatFloat ssa=GeneCaller.getSSAF(); | |
722 // ssa.setWeights(refWeights, insWeights, delWeights); | |
723 ssa.setWeights(refWeights); | |
724 int a=0, b=ref.length-1; | |
725 int[] max=ssa.fillUnlimited(query, ref, a, b, -9999); | |
726 if(max==null){return 0;} | |
727 | |
728 final int rows=max[0]; | |
729 final int maxCol=max[1]; | |
730 final int maxState=max[2]; | |
731 byte[] match=ssa.traceback(query, ref, a, b, rows, maxCol, 0); | |
732 int[] score=ssa.score(query, ref, a, b, rows, maxCol, 0); | |
733 final float id=Read.identity(match); | |
734 r.match=match; | |
735 r.start=score[1]; | |
736 r.stop=score[2]; | |
737 r.setMapped(true); | |
738 return id; | |
739 } | |
740 | |
741 static String fixMeta(String s){ | |
742 if(s==null){return null;} | |
743 int colon=s.indexOf(':'); | |
744 assert(colon>=0); | |
745 if(s.length()==colon+5 && s.endsWith(":null")){return null;} | |
746 return s.replace('\t', ' '); | |
747 } | |
748 | |
749 static ArrayList<String> fixMeta(ArrayList<String> list){ | |
750 if(list==null || list.isEmpty()){return null;} | |
751 for(int i=0; i<list.size(); i++){ | |
752 String s=list.get(i); | |
753 s=fixMeta(s); | |
754 if(s==null){ | |
755 list.remove(i); | |
756 i--; | |
757 }else{ | |
758 list.set(i, s); | |
759 } | |
760 } | |
761 if(list.isEmpty()){return null;} | |
762 list.trimToSize(); | |
763 Collections.sort(list); | |
764 return list; | |
765 } | |
766 | |
767 public static final float aniToWkid(final double ani){ | |
768 final float wkid; | |
769 if(ani<=0){ | |
770 wkid=0; | |
771 }else if(k2<1){ | |
772 wkid=(float)Math.pow(ani, k); | |
773 }else{ | |
774 wkid=0.5f*(float)(Math.pow(ani, k)+Math.pow(ani, k2)); | |
775 } | |
776 return wkid; | |
777 } | |
778 | |
779 public static final float wkidToAniExact(final double wkid, final int A, final int B, final double maxDeviation){ | |
780 assert(A>B); | |
781 assert(maxDeviation>0); | |
782 final double logWkid=Math.log(wkid); | |
783 final double invA=1.0/A; | |
784 final double invB=1.0/B; | |
785 | |
786 double aniUpper=Math.exp(logWkid*invA); | |
787 double aniLower=Math.exp(logWkid*invB); | |
788 assert(aniLower<=aniUpper); | |
789 double aniEst=(aniLower+aniUpper)*0.5f; | |
790 // System.err.println(aniEst); | |
791 // if(B>16){//Fast mode | |
792 // aniUpper=(aniUpper+3*aniEst)*0.25; | |
793 // aniLower=(aniLower+3*aniEst)*0.25; | |
794 // } | |
795 double wkidEst=aniToWkid(aniEst, A, B); | |
796 int iters=1; | |
797 // System.err.println(aniLower+"\t"+aniUpper+"\t"+aniEst+"\t"+aniToWkid(aniEst, A, B)+"\t"+(wkidEst-wkid)); | |
798 while(Math.abs(wkidEst-wkid)>maxDeviation && iters<40){ | |
799 if(wkidEst<wkid){aniLower=aniEst;} | |
800 else{aniUpper=aniEst;} | |
801 aniEst=(aniLower+aniUpper)*0.5f; | |
802 wkidEst=aniToWkid(aniEst, A, B); | |
803 // System.err.println(aniLower+"\t"+aniUpper+"\t"+aniEst+"\t"+wkidEst+"\t"+(wkidEst-wkid)); | |
804 iters++; | |
805 } | |
806 // System.err.println("Iterations: "+iters); | |
807 return (float)aniEst; | |
808 } | |
809 | |
810 public static double aniToWkid(double ani, int A, int B){ | |
811 if(A<B){int C=A; A=B; B=C;}//Swap | |
812 double aPow=Math.pow(ani, A); | |
813 double bPow=Math.pow(ani, B); | |
814 // return 0.5*(aPow+bPow); | |
815 return useToValue2 ? 0.5*(aPow+bPow) : 0.5*(aPow+(bPow*0.4+aPow*0.6)); | |
816 } | |
817 | |
818 //From Kayla | |
819 // public static double aniToWkid(double ANI, int K2, int K1){ | |
820 // if(K2<K1){int C=K2; K2=K1; K1=C;}//Swap | |
821 // | |
822 // return 0.5*((1 - (1 - Math.pow(ANI, K2-K1))*Math.pow(ANI, K1))*Math.pow(ANI, K2) + | |
823 // (1 + (1 - Math.pow(ANI, K2-K1))*Math.pow(ANI, K1))*Math.pow(ANI, K1)); | |
824 // } | |
825 | |
826 public static double aniToWkid(double ani, int A){ | |
827 return Math.pow(ani, A); | |
828 } | |
829 | |
830 public static final float wkidToAniExact(final double wkid, final int k){ | |
831 return (float)Math.exp(Math.log(wkid)/(k)); | |
832 } | |
833 | |
834 public static final float wkidToAni(final double wkid){ | |
835 final float ani; | |
836 if(wkid<=0){ | |
837 ani=0; | |
838 }else if(k2<1){ | |
839 ani=(float)Math.exp(Math.log(wkid)/k); | |
840 }else{ | |
841 if(EXACT_ANI){return wkidToAniExact(wkid, k, k2, 0.00005);} | |
842 | |
843 //This is linear interpolation which is not correct. But for some reason it works really well. | |
844 final double log=Math.log(wkid); | |
845 | |
846 // double ani1=Math.exp(log/k); | |
847 // double ani2=Math.exp(log/k2); | |
848 // ani=(float)(0.5*(ani1+ani2)); | |
849 | |
850 //alternatively... this one seems to work better. | |
851 // ani=(float)Math.exp(2*log/(k*1.1+k2*0.9)); | |
852 ani=(float)Math.exp(2*log/(1.2*k+.8*k2)); | |
853 } | |
854 return ani; | |
855 } | |
856 | |
857 // public static void kmerArrayToSketchArray(long[] kmers){ | |
858 // for(int i=0; i<kmers.length; i++){ | |
859 // long kmer=kmers[i]; | |
860 // long hash=SketchTool.hash(kmer); | |
861 // assert(hash>=minHashValue); | |
862 // hash=Long.MAX_VALUE-hash; | |
863 // kmers[i]=hash; | |
864 // } | |
865 // Shared.sort(kmers); | |
866 // } | |
867 | |
868 public static void hashArrayToSketchArray(long[] keys){ | |
869 for(int i=0; i<keys.length; i++){ | |
870 long hash=keys[i]; | |
871 assert(hash>=minHashValue); | |
872 hash=Long.MAX_VALUE-hash; | |
873 keys[i]=hash; | |
874 } | |
875 Shared.sort(keys); | |
876 } | |
877 | |
878 public static final long genomeSizeEstimate(long min, int length) { | |
879 if(length==0){return 0;} | |
880 double est=((double)Long.MAX_VALUE)*2*length/(Tools.max(min, 1)); | |
881 // if(k2>0){est*=0.5;} //This is necessary if the hash function uses max, but not random. | |
882 // System.err.println("max="+Long.MAX_VALUE+", min="+min+", len="+length+", est="+(long)est); | |
883 // new Exception().printStackTrace(System.err); | |
884 return (long)Math.ceil(est); | |
885 } | |
886 | |
887 public static final int toSketchSize(long genomeSizeBases, long genomeSizeKmers, long genomeSizeEstimate, int maxSketchSize){ | |
888 // assert(false) : genomeSizeBases+", "+genomeSizeKmers+", "+genomeSizeEstimate+", "+maxSketchSize+", "+useSizeEstimate; | |
889 if(genomeSizeEstimate>0 && useSizeEstimate){ | |
890 return toSketchSizeKmers(genomeSizeEstimate, maxSketchSize); | |
891 } | |
892 if(genomeSizeKmers>0){ | |
893 return toSketchSizeKmers(genomeSizeKmers, maxSketchSize); | |
894 } | |
895 assert(genomeSizeBases>0) : "BBSketch does not currently support empty files.\n" | |
896 +genomeSizeBases+", "+genomeSizeKmers+", "+genomeSizeEstimate+", "+maxSketchSize+", "+useSizeEstimate; | |
897 return toSketchSizeBases(genomeSizeBases, maxSketchSize); | |
898 } | |
899 | |
900 private static final int toSketchSizeBases(long genomeSizeBases, int maxSketchSize) { | |
901 return toSketchSizeKmers(Tools.max(0, genomeSizeBases-k+1), maxSketchSize); | |
902 } | |
903 | |
904 private static final int toSketchSizeKmers(long genomeSizeKmers, int maxSketchSize) { | |
905 // System.err.println(genomeSizeKmers+", "+maxSketchSize); | |
906 // new Exception().printStackTrace(); | |
907 if(AUTOSIZE){ | |
908 if(aminoOrTranslate()){ | |
909 float linear1=Tools.min(60+0.5f*(float)Math.sqrt(genomeSizeKmers), | |
910 maxGenomeFractionSmall*genomeSizeKmers*2); | |
911 float linear2=genomeSizeKmers*maxGenomeFraction*2; | |
912 float poly=0+1f*(float)Math.sqrt(genomeSizeKmers)+90f*(float)Math.pow(genomeSizeKmers, 0.3); | |
913 float log=Tools.max(1000, -4000+3500*(float)Math.log(Tools.max(1, genomeSizeKmers))); | |
914 float min=Tools.min(Tools.max(linear1, linear2), poly, log); | |
915 assert(min>=Integer.MIN_VALUE && min<=Integer.MAX_VALUE) : min; | |
916 | |
917 // System.err.println(genomeSizeKmers+" -> "+linear1+", "+linear2+", "+poly+", "+log); | |
918 | |
919 return (int)Tools.min(genomeSizeKmers*keyFraction2, min*AUTOSIZE_FACTOR); | |
920 // return (int)Tools.max(1, min*AUTOSIZE_FACTOR); | |
921 }else{ | |
922 float linear1=Tools.min(smallSketchSize+0.5f*(float)Math.sqrt(genomeSizeKmers), | |
923 maxGenomeFractionSmall*genomeSizeKmers-10); | |
924 float linear2=genomeSizeKmers*maxGenomeFraction; | |
925 float poly=0+1f*(float)Math.sqrt(genomeSizeKmers)+90f*(float)Math.pow(genomeSizeKmers, 0.3); | |
926 float log=Tools.max(1000, -4000+3500*(float)Math.log(Tools.max(1, genomeSizeKmers))+8f*(float)Math.pow(genomeSizeKmers, 0.3)); | |
927 float min=Tools.min(Tools.max(linear1, linear2), poly, log); | |
928 assert(min>=Integer.MIN_VALUE && min<=Integer.MAX_VALUE) : min; | |
929 | |
930 // System.err.println(genomeSizeKmers+" -> "+linear1+", "+linear2+", "+poly+", "+log); | |
931 | |
932 int result=(int)Tools.min(genomeSizeKmers*keyFraction2, min*AUTOSIZE_FACTOR); | |
933 // System.err.println(result); | |
934 return result; | |
935 // return (int)Tools.max(1, min*AUTOSIZE_FACTOR); | |
936 } | |
937 }else if(AUTOSIZE_LINEAR){ | |
938 if(aminoOrTranslate()){ | |
939 return (int)(Tools.min(10000000, Tools.min(maxGenomeFractionSmall, AUTOSIZE_LINEAR_DENSITY)*genomeSizeKmers)); | |
940 }else{ | |
941 return (int)(Tools.min(10000000, Tools.min(maxGenomeFractionSmall, AUTOSIZE_LINEAR_DENSITY)*genomeSizeKmers)); | |
942 } | |
943 }else{ | |
944 if(aminoOrTranslate()){//kmers are roughly 3x smaller... | |
945 return Tools.min((int)(2+3*maxGenomeFraction*genomeSizeKmers), maxSketchSize); | |
946 }else{ | |
947 return Tools.min((int)(2+maxGenomeFraction*genomeSizeKmers), maxSketchSize); | |
948 } | |
949 } | |
950 } | |
951 | |
952 // static final long toValue2(long kmer, long rkmer){ | |
953 // assert(k2>0 && k2<k) : "k="+k+","+k2; | |
954 // | |
955 // final long value; | |
956 // if(rcomp){ | |
957 // assert(!amino && !translate); | |
958 //// if(!rcomp){return kmer;} | |
959 // long kmer2=((kmer&k2midmask)>>>k2shift); | |
960 // long rkmer2=((rkmer&k2midmask)>>>k2shift); | |
961 // | |
962 //// assert(kmer2>=0); | |
963 //// assert(kmer<0 || kmer2<=kmer); | |
964 //// assert(rkmer<0 || rkmer2<=kmer); | |
965 // | |
966 //// assert(false) : "\n"+Long.toBinaryString(kmer)+ | |
967 //// "\n"+Long.toBinaryString(rkmer)+ | |
968 //// "\n"+Long.toBinaryString(kmer2)+ | |
969 //// "\n"+Long.toBinaryString(rkmer2); | |
970 // | |
971 // //TODO: Slow | |
972 //// assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) : | |
973 //// "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"+AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"; | |
974 //// assert(kmer2==AminoAcid.reverseComplementBinaryFast(rkmer2, k2)); | |
975 // long max2=Tools.max(kmer2, rkmer2); | |
976 //// long cmasked=max2&kmerChoiceMask; | |
977 //// boolean longer=kmerChoiceBitset.get((int)cmasked); | |
978 // boolean longer=((max2%4999L)&1L)==0L; | |
979 // value=longer ? Tools.max(kmer, rkmer) : max2; | |
980 //// System.err.println("\n"+longer+", "+max2+", "+(max2%4999L)+", "+((max2%4999L)&1L)+ | |
981 //// ", "+Long.toHexString(kmer)+", "+Long.toHexString(max2)+", "+Long.toHexString(value)); | |
982 //// long value=longer ? (kmer2>rkmer2 ? kmer : rkmer) : max2; | |
983 //// System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+ | |
984 //// "\n"+AminoAcid.kmerToString(kmer2, k2)+"\n"+AminoAcid.kmerToString(rkmer2, k2)+"\n"+AminoAcid.kmerToString(value, k)+"\n"); | |
985 //// assert(kmer2==AminoAcid.reverseComplementBinaryFast(rkmer2, k2));) | |
986 // }else{ | |
987 // assert(amino || translate); | |
988 //// long kmer2=(kmer&k2mask); | |
989 // long kmer2=((kmer&k2midmask)>>>k2shift); | |
990 // boolean longer=((kmer2%4999)&1)==0; | |
991 // value=longer ? kmer : kmer2; | |
992 // } | |
993 // return value; | |
994 // } | |
995 | |
996 private static final long toValue(long kmer, long rkmer){ | |
997 // assert(toValue2(kmer, rkmer)==toValue2(rkmer, kmer)); | |
998 assert(!useToValue2); | |
999 // if(useToValue2){return toValue2(kmer, rkmer);} | |
1000 long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); | |
1001 return value; | |
1002 } | |
1003 | |
1004 /*--------------------------------------------------------------*/ | |
1005 /*---------------- Constants ----------------*/ | |
1006 /*--------------------------------------------------------------*/ | |
1007 | |
1008 public static final int RAW=0, HEX=1, A48=2; | |
1009 public static final char[] codingArray={'R', 'H', 'A'}; | |
1010 | |
1011 static final byte[] hexTable=new byte[128]; | |
1012 static { | |
1013 Arrays.fill(hexTable, (byte)-1); | |
1014 for(int i='0'; i<='9'; i++){ | |
1015 hexTable[i]=(byte)(i-'0'); | |
1016 } | |
1017 for(int i='A'; i<='F'; i++){ | |
1018 hexTable[i]=hexTable[i+'a'-'A']=(byte)(i-'A'+10); | |
1019 } | |
1020 hexTable['x']=hexTable['X']=hexTable['-']=hexTable['+']=0; | |
1021 } | |
1022 | |
1023 public static final int ONE_SKETCH=1, PER_SEQUENCE=2, PER_TAXA=3, PER_IMG=4/*, PER_HEADER=5*/, PER_FILE=6; | |
1024 | |
1025 /*--------------------------------------------------------------*/ | |
1026 /*---------------- Default Locations ----------------*/ | |
1027 /*--------------------------------------------------------------*/ | |
1028 | |
1029 // public static ArrayList<String> IMG_PATH(){return makePaths(IMG_PATH, 31);} | |
1030 // public static ArrayList<String> NT_PATH(){return makePaths(NT_PATH, 31);} | |
1031 // public static ArrayList<String> NR_PATH(){throw new RuntimeException("NR is not currently available.");} | |
1032 // public static ArrayList<String> REFSEQ_PATH(){return makePaths(REFSEQ_PATH, 31);} | |
1033 // public static ArrayList<String> SILVA_PATH(){return makePaths(SILVA_PATH, 31);} | |
1034 | |
1035 private static ArrayList<String> makePaths(String pattern, int files){ | |
1036 ArrayList<String> list=new ArrayList<String>(files); | |
1037 for(int i=0; i<files; i++){ | |
1038 list.add(pattern.replace("#", ""+i)); | |
1039 } | |
1040 return list; | |
1041 } | |
1042 | |
1043 private static final String IMG_PATH="/global/cfs/cdirs/bbtools/img/current/img#.sketch"; | |
1044 private static final String NT_PATH="/global/cfs/cdirs/bbtools/nt/current/taxa#.sketch"; | |
1045 private static final String NR_PATH="/global/cfs/cdirs/bbtools/nr/current/taxa#.sketch"; | |
1046 private static final String REFSEQ_PATH="/global/cfs/cdirs/bbtools/refseq/current/taxa#.sketch"; | |
1047 private static final String REFSEQ_PATH_BIG="/global/cfs/cdirs/bbtools/refseq/current/big#.sketch"; | |
1048 private static final String SILVA_PATH="/global/cfs/cdirs/bbtools/silva/latest/both_taxa#.sketch"; | |
1049 private static final String PROKPROT_PATH="/global/cfs/cdirs/bbtools/refseq/current/prot/taxa#.sketch"; | |
1050 private static final String PROKPROT_PATH_BIG="/global/cfs/cdirs/bbtools/refseq/current/prot/big#.sketch"; | |
1051 private static final String MITO_PATH="/global/cfs/cdirs/bbtools/mito2/taxa#.sketch"; | |
1052 private static final String FUNGI_PATH="/global/cfs/cdirs/bbtools/mito2/fungi#.sketch"; | |
1053 | |
1054 private static final String IMG_PATH_IGBVM="/data/sketch/img/current/img#.sketch"; | |
1055 private static final String NT_PATH_IGBVM="/data/sketch/nt/current/taxa#.sketch"; | |
1056 private static final String NR_PATH_IGBVM="/data/sketch/nr/current/taxa#.sketch"; | |
1057 private static final String REFSEQ_PATH_IGBVM="/data/sketch/refseq/current/taxa#.sketch"; | |
1058 private static final String REFSEQ_PATH_BIG_IGBVM="/data/sketch/refseq/current/big#.sketch"; | |
1059 private static final String SILVA_PATH_IGBVM="/data/sketch/silva/current/both_taxa#.sketch"; | |
1060 private static final String PROKPROT_PATH_IGBVM="/data/sketch/refseq/current/prot/taxa#.sketch"; | |
1061 private static final String PROKPROT_PATH_BIG_IGBVM="/data/sketch/refseq/current/prot/big#.sketch"; | |
1062 private static final String MITO_PATH_IGBVM="/data/sketch/mito2/taxa#.sketch"; | |
1063 private static final String FUNGI_PATH_IGBVM="/data/sketch/mito2/fungi#.sketch"; | |
1064 | |
1065 private static final String IMG_PATH_AWS=null; | |
1066 private static final String NT_PATH_AWS="/test1/sketch/latest/nt/taxa#.sketch"; | |
1067 private static final String NR_PATH_AWS=null; | |
1068 private static final String REFSEQ_PATH_AWS="/test1/sketch/latest/refseq/taxa#.sketch"; | |
1069 private static final String SILVA_PATH_AWS="/test1/sketch/latest/ribo/both_taxa#.sketch"; | |
1070 private static final String PROKPROT_PATH_AWS="/test1/sketch/latest/protein/taxa#.sketch"; | |
1071 private static final String MITO_PATH_AWS=null; | |
1072 private static final String FUNGI_PATH_AWS=null; | |
1073 | |
1074 public static final String IMG_PATH(){return Shared.IGBVM ? IMG_PATH_IGBVM : Shared.AWS ? IMG_PATH_AWS : IMG_PATH;} | |
1075 public static final String NT_PATH(){return Shared.IGBVM ? NT_PATH_IGBVM : Shared.AWS ? NT_PATH_AWS : NT_PATH;} | |
1076 public static final String NR_PATH(){return Shared.IGBVM ? NR_PATH_IGBVM : Shared.AWS ? NR_PATH_AWS : NR_PATH;} | |
1077 public static final String REFSEQ_PATH(){return Shared.IGBVM ? REFSEQ_PATH_IGBVM : Shared.AWS ? REFSEQ_PATH_AWS : REFSEQ_PATH;} | |
1078 public static final String REFSEQ_PATH_BIG(){return Shared.IGBVM ? REFSEQ_PATH_BIG_IGBVM : REFSEQ_PATH_BIG;} | |
1079 public static final String SILVA_PATH(){return Shared.IGBVM ? SILVA_PATH_IGBVM : Shared.AWS ? SILVA_PATH_AWS : SILVA_PATH;} | |
1080 public static final String PROKPROT_PATH(){return Shared.IGBVM ? PROKPROT_PATH_IGBVM : Shared.AWS ? PROKPROT_PATH_AWS : PROKPROT_PATH;} | |
1081 public static final String PROKPROT_PATH_BIG(){return Shared.IGBVM ? PROKPROT_PATH_BIG_IGBVM : PROKPROT_PATH_BIG;} | |
1082 public static final String MITO_PATH(){return Shared.IGBVM ? MITO_PATH_IGBVM : Shared.AWS ? MITO_PATH_AWS : MITO_PATH;} | |
1083 public static final String FUNGI_PATH(){return Shared.IGBVM ? FUNGI_PATH_IGBVM : Shared.AWS ? FUNGI_PATH_AWS : FUNGI_PATH;} | |
1084 | |
1085 /*--------------------------------------------------------------*/ | |
1086 /*---------------- Getters ----------------*/ | |
1087 /*--------------------------------------------------------------*/ | |
1088 | |
1089 public static boolean useWhitelist() {return useWhitelist;} | |
1090 public static String blacklist() {return blacklist;} | |
1091 | |
1092 /*--------------------------------------------------------------*/ | |
1093 /*---------------- Static Fields ----------------*/ | |
1094 /*--------------------------------------------------------------*/ | |
1095 | |
1096 public static int CODING=A48; | |
1097 public static boolean deltaOut=true; | |
1098 public static boolean rcomp=true; | |
1099 | |
1100 public static final int defaultK=32; | |
1101 public static final int defaultK2=24; | |
1102 public static final int defaultKAmino=12; | |
1103 public static final int defaultK2Amino=9; | |
1104 public static int k=defaultK; | |
1105 public static int k2=defaultK2; | |
1106 public static int entropyK=3; | |
1107 public static boolean setK=false; | |
1108 public static boolean amino=false; | |
1109 public static boolean amino8=false; | |
1110 public static boolean translate=false; | |
1111 public static boolean sixframes=false; | |
1112 public static boolean processSSU=true; | |
1113 public static int min_SSU_len=1000; | |
1114 public static int HASH_VERSION=2; | |
1115 public static String pgmFile=null; | |
1116 public static GeneModel pgm=null; | |
1117 | |
1118 static boolean aminoOrTranslate(){return amino | translate;} | |
1119 | |
1120 private static int bitsPerCycle=8; //Optimal speed for K=31 is 9bpc (15% faster than 8). 9, 10, and 11 are similar. | |
1121 private static long cycleMask=~((-1L)<<bitsPerCycle); | |
1122 private static final long codeOrMask=1L; | |
1123 private static final long codeAndMask=~1L; | |
1124 private static int maxCycles=(64+bitsPerCycle-1)/bitsPerCycle; | |
1125 private static int codeIncrement=(int)(cycleMask+1); | |
1126 private static int codeMax; | |
1127 private static int codeMax2; | |
1128 | |
1129 private static long hashSeed=12345; | |
1130 private static long[][] codes;//=makeCodes(maxCycles, codeIncrement, hashSeed, false); | |
1131 private static long[] codes1D;//=makeCodes1D(codes); | |
1132 static boolean useToValue2=true; | |
1133 | |
1134 //These are set in postParse() | |
1135 private static int bitsPerBase=2; | |
1136 private static int basesPerCycle=bitsPerCycle/bitsPerBase; | |
1137 private static int hashCycles=64/bitsPerCycle; //Note: Needs to be variable based on k to make k2 codes compatible with k codes | |
1138 private static int hashCycles2; | |
1139 private static long k2mask; | |
1140 private static long k2submask; | |
1141 //Difference in length of k and k2, in bits | |
1142 private static int k2shift; | |
1143 private static long k2midmask; | |
1144 | |
1145 /** | |
1146 * Make the SketchHeap size this factor bigger, | |
1147 * when minKeyOccuranceCount is used | |
1148 */ | |
1149 public static double sketchHeapFactor=8; | |
1150 // static int minKeyOccuranceCount=1; | |
1151 | |
1152 public static int minSketchSize=3; | |
1153 public static int targetSketchSize=10000; | |
1154 public static boolean AUTOSIZE=true; | |
1155 public static float AUTOSIZE_FACTOR=1; | |
1156 public static boolean SET_AUTOSIZE_FACTOR=false; | |
1157 public static boolean SET_AUTOSIZE=false; | |
1158 public static boolean SET_TARGET_SIZE=false; | |
1159 public static boolean AUTOSIZE_LINEAR=false; | |
1160 public static double AUTOSIZE_LINEAR_DENSITY=0.001; | |
1161 public static float maxGenomeFraction=0.04f;//Was 0.015, but that is often too sparse | |
1162 public static float maxGenomeFractionSmall=0.10f; | |
1163 public static int smallSketchSize=150; | |
1164 public static boolean makeIndex=true; | |
1165 public static float prealloc=0; | |
1166 public static boolean allToAll=false; | |
1167 public static boolean compareSelf=false; | |
1168 public static boolean skipCompare=false; | |
1169 public static final int bitSetBits=2; //Needs to be 2 for unique counts. | |
1170 | |
1171 private static double keyFraction=0.16; | |
1172 private static double keyFraction2=keyFraction*1.2; | |
1173 public static long minHashValue=setMinHashValue(); | |
1174 public static double keyFraction(){return keyFraction;} | |
1175 public static void setKeyFraction(double d){ | |
1176 assert(d>0); | |
1177 keyFraction=Tools.mid(0.0001, d, 0.5); | |
1178 keyFraction2=Tools.mid(0.0001, keyFraction*1.2, 0.5); | |
1179 } | |
1180 public static long setMinHashValue(){ | |
1181 double mult=1-2*keyFraction; | |
1182 minHashValue=(long)(mult*Long.MAX_VALUE); | |
1183 assert(minHashValue>=0 && minHashValue<Long.MAX_VALUE); | |
1184 return minHashValue; | |
1185 } | |
1186 | |
1187 public static int minFakeID=1900000000; | |
1188 static boolean hashNames=false; | |
1189 static boolean skipNonCanonical=true; | |
1190 static boolean useSizeEstimate=true; | |
1191 public static boolean allowMultithreadedFastq=false; | |
1192 static boolean forceDisableMultithreadedFastq=false; | |
1193 | |
1194 static boolean preferSSUMap=false; | |
1195 static boolean preferSSUMapForEuks=true; | |
1196 static boolean useSSUMapOnly=false; | |
1197 static boolean useSSUMapOnlyForEuks=false; | |
1198 static boolean ban18SForProks=true; | |
1199 static boolean ban16SForEuks=true; | |
1200 | |
1201 static long sampleseed=-1L; | |
1202 | |
1203 public static TaxTree taxtree=null; | |
1204 private static String treefile=null; | |
1205 static String blacklist=null; | |
1206 static boolean useWhitelist=false; | |
1207 private static boolean postparsed=false; | |
1208 public static boolean KILL_OK=false; | |
1209 public static boolean EXACT_ANI=true; | |
1210 public static boolean useSSA=true; | |
1211 public static boolean useSSA3=false; | |
1212 | |
1213 //Needs to be last due to dependencies. | |
1214 public static DisplayParams defaultParams=new DisplayParams(); | |
1215 | |
1216 static AlignmentThreadPool alignerPool=null; | |
1217 | |
1218 public static boolean verbose2=false; | |
1219 public static boolean LOADER2=true; | |
1220 | |
1221 } |