Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/icecream/IceCreamMaker.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package icecream; | |
2 | |
3 import java.io.PrintStream; | |
4 import java.util.ArrayList; | |
5 import java.util.Arrays; | |
6 import java.util.Random; | |
7 import java.util.concurrent.atomic.AtomicLong; | |
8 | |
9 import dna.AminoAcid; | |
10 import fileIO.ByteFile; | |
11 import fileIO.ByteStreamWriter; | |
12 import fileIO.FileFormat; | |
13 import fileIO.ReadWrite; | |
14 import shared.Parse; | |
15 import shared.Parser; | |
16 import shared.PreParser; | |
17 import shared.ReadStats; | |
18 import shared.Shared; | |
19 import shared.Timer; | |
20 import shared.Tools; | |
21 import stream.ConcurrentReadInputStream; | |
22 import stream.ConcurrentReadOutputStream; | |
23 import stream.FASTQ; | |
24 import stream.FastaReadInputStream; | |
25 import stream.Read; | |
26 import stream.SamLine; | |
27 import structures.ByteBuilder; | |
28 import structures.IntList; | |
29 import structures.ListNum; | |
30 | |
31 /** | |
32 * Generates chimeric PacBio reads containing inverted repeats | |
33 * due to missing adapters. | |
34 * | |
35 * @author Brian Bushnell | |
36 * @date June 8, 2019 | |
37 * | |
38 */ | |
39 public class IceCreamMaker { | |
40 | |
41 /*--------------------------------------------------------------*/ | |
42 /*---------------- Initialization ----------------*/ | |
43 /*--------------------------------------------------------------*/ | |
44 | |
45 /** | |
46 * Code entrance from the command line. | |
47 * @param args Command line arguments | |
48 */ | |
49 public static void main(String[] args){ | |
50 //Start a timer immediately upon code entrance. | |
51 Timer t=new Timer(); | |
52 | |
53 //Create an instance of this class | |
54 IceCreamMaker x=new IceCreamMaker(args); | |
55 | |
56 //Run the object | |
57 x.process(t); | |
58 | |
59 //Close the print stream if it was redirected | |
60 Shared.closeStream(x.outstream); | |
61 } | |
62 | |
63 /** | |
64 * Constructor. | |
65 * @param args Command line arguments | |
66 */ | |
67 public IceCreamMaker(String[] args){ | |
68 | |
69 {//Preparse block for help, config files, and outstream | |
70 PreParser pp=new PreParser(args, getClass(), false); | |
71 args=pp.args; | |
72 outstream=pp.outstream; | |
73 } | |
74 | |
75 //Set shared static variables prior to parsing | |
76 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; | |
77 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); | |
78 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false; | |
79 Shared.FAKE_QUAL=8; | |
80 // Shared.FASTA_WRAP=511; | |
81 SamLine.SET_FROM_OK=true; | |
82 | |
83 {//Parse the arguments | |
84 final Parser parser=parse(args); | |
85 Parser.processQuality(); | |
86 | |
87 maxReads=parser.maxReads; | |
88 overwrite=ReadStats.overwrite=parser.overwrite; | |
89 append=ReadStats.append=parser.append; | |
90 | |
91 in1=parser.in1; | |
92 extin=parser.extin; | |
93 | |
94 out1=parser.out1; | |
95 qfout1=parser.qfout1; | |
96 extout=parser.extout; | |
97 } | |
98 | |
99 validateParams(); | |
100 fixExtensions(); //Add or remove .gz or .bz2 as needed | |
101 checkFileExistence(); //Ensure files can be read and written | |
102 checkStatics(); //Adjust file-related static fields as needed for this program | |
103 | |
104 //Create output FileFormat objects | |
105 ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false); | |
106 | |
107 //Create input FileFormat objects | |
108 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); | |
109 | |
110 ffIdHist=FileFormat.testOutput(outIdHist, FileFormat.TXT, null, false, overwrite, append, false); | |
111 | |
112 insThresh=insFraction; | |
113 delThresh=delFraction+insThresh; | |
114 } | |
115 | |
116 /*--------------------------------------------------------------*/ | |
117 /*---------------- Initialization Helpers ----------------*/ | |
118 /*--------------------------------------------------------------*/ | |
119 | |
120 /** Parse arguments from the command line */ | |
121 private Parser parse(String[] args){ | |
122 | |
123 //Create a parser object | |
124 Parser parser=new Parser(); | |
125 | |
126 //Set any necessary Parser defaults here | |
127 //parser.foo=bar; | |
128 | |
129 //Parse each argument | |
130 for(int i=0; i<args.length; i++){ | |
131 String arg=args[i]; | |
132 | |
133 //Break arguments into their constituent parts, in the form of "a=b" | |
134 String[] split=arg.split("="); | |
135 String a=split[0].toLowerCase(); | |
136 String b=split.length>1 ? split[1] : null; | |
137 if(b!=null && b.equalsIgnoreCase("null")){b=null;} | |
138 | |
139 if(a.equals("verbose")){ | |
140 verbose=Parse.parseBoolean(b); | |
141 } | |
142 | |
143 else if(a.equals("idhist")){ | |
144 outIdHist=b; | |
145 } | |
146 | |
147 else if(a.equals("minlength") || a.equals("minlen")){ | |
148 minMoleculeLength=Parse.parseIntKMG(b); | |
149 }else if(a.equals("maxlength") || a.equals("maxlen")){ | |
150 maxMoleculeLength=Parse.parseIntKMG(b); | |
151 }else if(a.equals("length") || a.equals("len")){ | |
152 minMoleculeLength=maxMoleculeLength=Parse.parseIntKMG(b); | |
153 } | |
154 | |
155 else if(a.equals("minmovie") || a.equals("minmov")){ | |
156 minMovie=Parse.parseIntKMG(b); | |
157 }else if(a.equals("maxmovie") || a.equals("maxmov")){ | |
158 maxMovie=Parse.parseIntKMG(b); | |
159 }else if(a.equals("movie") || a.equals("mov")){ | |
160 minMovie=maxMovie=Parse.parseIntKMG(b); | |
161 } | |
162 | |
163 else if(a.equals("missingrate") || a.equals("missing")){ | |
164 missingRate=Double.parseDouble(b); | |
165 assert(missingRate<=1); | |
166 }else if(a.equals("hiddenrate") || a.equals("hidden")){ | |
167 hiddenRate=Double.parseDouble(b); | |
168 assert(hiddenRate<=1); | |
169 }else if(a.equals("bothends")){ | |
170 allowBothEndsBad=Parse.parseBoolean(b); | |
171 assert(false) : "TODO"; | |
172 } | |
173 | |
174 else if(a.equals("gc")){ | |
175 genomeGC=(float)Double.parseDouble(b); | |
176 assert(genomeGC>=0 && genomeGC<=1); | |
177 }else if(a.equals("genomesize")){ | |
178 genomeSize=Parse.parseKMG(b); | |
179 }else if(a.equals("addns") || a.equals("ns")){ | |
180 addNs=Parse.parseBoolean(b); | |
181 }else if(a.equals("seed")){ | |
182 seed=Long.parseLong(b); | |
183 }else if(a.equals("zmws") || a.equals("maxzmws") || a.equals("reads")){ | |
184 maxZMWs=Parse.parseIntKMG(b); | |
185 }else if(a.equals("ccs")){ | |
186 makeCCS=Parse.parseBoolean(b); | |
187 } | |
188 | |
189 else if(a.equals("invertedrepeatrate") || a.equals("invertrepeatrate") || a.equals("irrate")){ | |
190 invertedRepeatRate=Double.parseDouble(b); | |
191 assert(invertedRepeatRate>=0); | |
192 }else if(a.equals("invertedrepeatminlen") || a.equals("invertrepeatminlen") || a.equals("irminlen")){ | |
193 invertedRepeatMinLength=Parse.parseIntKMG(b); | |
194 }else if(a.equals("invertedrepeatmaxlen") || a.equals("invertrepeatmaxlen") || a.equals("irmaxlen")){ | |
195 invertedRepeatMaxLength=Parse.parseIntKMG(b); | |
196 }else if(a.equals("invertedrepeatlen") || a.equals("invertrepeatlen") || a.equals("irlen")){ | |
197 invertedRepeatMinLength=invertedRepeatMaxLength=Parse.parseIntKMG(b); | |
198 } | |
199 | |
200 else if(a.equals("miner") || a.equals("minerrorrate")){ | |
201 minErrorRate=(float)Double.parseDouble(b); | |
202 assert(minErrorRate>=0 && minErrorRate<=1); | |
203 }else if(a.equals("maxer") || a.equals("maxerrorrate")){ | |
204 maxErrorRate=(float)Double.parseDouble(b); | |
205 assert(maxErrorRate>=0 && maxErrorRate<=1); | |
206 }else if(a.equals("er") || a.equals("errorrate")){ | |
207 minErrorRate=maxErrorRate=(float)Double.parseDouble(b); | |
208 assert(minErrorRate>=0 && minErrorRate<=1); | |
209 } | |
210 | |
211 else if(a.equals("minid") || a.equals("minidentity")){ | |
212 maxErrorRate=1-(float)Double.parseDouble(b); | |
213 assert(maxErrorRate>=0 && maxErrorRate<=1); | |
214 }else if(a.equals("maxid") || a.equals("maxidentity")){ | |
215 minErrorRate=1-(float)Double.parseDouble(b); | |
216 assert(minErrorRate>=0 && minErrorRate<=1); | |
217 }else if(a.equals("id") || a.equals("identity")){ | |
218 minErrorRate=maxErrorRate=1-(float)Double.parseDouble(b); | |
219 assert(minErrorRate>=0 && minErrorRate<=1); | |
220 }else if(a.equals("adderrors")){ | |
221 addErrors=Parse.parseBoolean(b); | |
222 }else if(a.equals("ref")){ | |
223 parser.in1=b; | |
224 } | |
225 | |
226 else if(a.equals("parse_flag_goes_here")){ | |
227 long fake_variable=Parse.parseKMG(b); | |
228 //Set a variable here | |
229 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser | |
230 //do nothing | |
231 }else{ | |
232 outstream.println("Unknown parameter "+args[i]); | |
233 assert(false) : "Unknown parameter "+args[i]; | |
234 } | |
235 } | |
236 | |
237 return parser; | |
238 } | |
239 | |
240 /** Add or remove .gz or .bz2 as needed */ | |
241 private void fixExtensions(){ | |
242 in1=Tools.fixExtension(in1); | |
243 } | |
244 | |
245 /** Ensure files can be read and written */ | |
246 private void checkFileExistence(){ | |
247 //Ensure output files can be written | |
248 if(!Tools.testOutputFiles(overwrite, append, false, out1, outIdHist)){ | |
249 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n"); | |
250 } | |
251 | |
252 //Ensure input files can be read | |
253 if(!Tools.testInputFiles(false, true, in1)){ | |
254 throw new RuntimeException("\nCan't read some input files.\n"); | |
255 } | |
256 | |
257 //Ensure that no file was specified multiple times | |
258 if(!Tools.testForDuplicateFiles(true, in1, out1, outIdHist)){ | |
259 throw new RuntimeException("\nSome file names were specified multiple times.\n"); | |
260 } | |
261 } | |
262 | |
263 /** Adjust file-related static fields as needed for this program */ | |
264 private static void checkStatics(){ | |
265 //Adjust the number of threads for input file reading | |
266 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ | |
267 ByteFile.FORCE_MODE_BF2=true; | |
268 } | |
269 | |
270 assert(FastaReadInputStream.settingsOK()); | |
271 } | |
272 | |
273 /** Ensure parameter ranges are within bounds and required parameters are set */ | |
274 private boolean validateParams(){ | |
275 assert(minMoleculeLength>0 && minMoleculeLength<=maxMoleculeLength) : minMoleculeLength+", "+maxMoleculeLength; | |
276 assert(minMovie>0 && minMovie<=maxMovie) : minMovie+", "+maxMovie; | |
277 | |
278 assert(missingRate>=0 && missingRate<=1) : missingRate; | |
279 assert(hiddenRate>=0 && hiddenRate<=1) : hiddenRate; | |
280 assert(genomeGC>=0 && genomeGC<=1) : genomeGC; | |
281 assert(in1!=null || genomeSize>maxMoleculeLength) : genomeSize; | |
282 assert(in1!=null || invertedRepeatMaxLength*2<genomeSize) : genomeSize; | |
283 assert(maxZMWs>0) : "zmsw="+maxZMWs+"; please set to a positive number."; | |
284 | |
285 assert(invertedRepeatRate>=0) : invertedRepeatRate; | |
286 assert(invertedRepeatMinLength>0 && invertedRepeatMinLength<=invertedRepeatMaxLength) : invertedRepeatMinLength+", "+invertedRepeatMaxLength; | |
287 | |
288 assert(minErrorRate>=0 && minErrorRate<=maxErrorRate) : minErrorRate+", "+maxErrorRate; | |
289 return true; | |
290 } | |
291 | |
292 /*--------------------------------------------------------------*/ | |
293 /*---------------- Outer Methods ----------------*/ | |
294 /*--------------------------------------------------------------*/ | |
295 | |
296 /** Create read streams and process all data */ | |
297 void process(Timer t){ | |
298 | |
299 //Turn off read validation in the input threads to increase speed | |
300 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; | |
301 Read.VALIDATE_IN_CONSTRUCTOR=true; | |
302 | |
303 //Create a read input stream | |
304 final ConcurrentReadInputStream cris=makeCris(); | |
305 | |
306 //Optionally create a read output stream | |
307 final ConcurrentReadOutputStream ros=makeCros(false); | |
308 | |
309 //Reset counters | |
310 readsProcessed=readsOut=0; | |
311 basesProcessed=basesOut=0; | |
312 | |
313 Random randy=Shared.threadLocalRandom(seed); | |
314 if(cris==null){ | |
315 ref=genSynthGenome(randy); | |
316 }else{ | |
317 ref=loadData(cris, randy); | |
318 } | |
319 | |
320 if(invertedRepeatRate>0){ | |
321 addInvertedRepeats(ref, randy); | |
322 } | |
323 | |
324 //Process the reads in separate threads | |
325 spawnThreads(cris, ros); | |
326 | |
327 if(verbose){outstream.println("Finished; closing streams.");} | |
328 | |
329 //Write anything that was accumulated by ReadStats | |
330 errorState|=ReadStats.writeAll(); | |
331 //Close the read streams | |
332 errorState|=ReadWrite.closeStreams(cris, ros); | |
333 | |
334 writeIdHist(); | |
335 | |
336 //Reset read validation | |
337 Read.VALIDATE_IN_CONSTRUCTOR=vic; | |
338 | |
339 //Report timing and results | |
340 t.stop(); | |
341 outstream.println(Tools.timeReadsBasesProcessed(t, readsOut, basesOut, 8)); | |
342 // outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); | |
343 | |
344 //Throw an exception of there was an error in a thread | |
345 if(errorState){ | |
346 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); | |
347 } | |
348 } | |
349 | |
350 private void writeIdHist(){ | |
351 if(ffIdHist==null){return;} | |
352 ByteStreamWriter bsw=new ByteStreamWriter(ffIdHist); | |
353 bsw.start(); | |
354 bsw.print("#Identity\tCount\n".getBytes()); | |
355 for(int i=0; i<idHist.length; i++){ | |
356 bsw.print(i*100.0/(ID_BINS-1), 3).print('\t').println(idHist[i]); | |
357 } | |
358 errorState|=bsw.poisonAndWait(); | |
359 } | |
360 | |
361 /** Create a Read Input Stream */ | |
362 private ConcurrentReadInputStream makeCris(){ | |
363 if(ffin1==null){return null;} | |
364 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null); | |
365 cris.start(); //Start the stream | |
366 if(verbose){outstream.println("Started cris");} | |
367 return cris; | |
368 } | |
369 | |
370 /** Create a Read Output Stream */ | |
371 private ConcurrentReadOutputStream makeCros(boolean pairedInput){ | |
372 if(ffout1==null){return null;} | |
373 | |
374 //Set output buffer size | |
375 final int buff=4; | |
376 | |
377 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream( | |
378 ffout1, null, qfout1, null, buff, null, false); | |
379 ros.start(); //Start the stream | |
380 return ros; | |
381 } | |
382 | |
383 /** Spawn process threads */ | |
384 private void spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){ | |
385 | |
386 //Do anything necessary prior to processing | |
387 | |
388 //Determine how many threads may be used | |
389 final int threads=Shared.threads(); | |
390 | |
391 //Fill a list with ProcessThreads | |
392 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); | |
393 for(int i=0; i<threads; i++){ | |
394 alpt.add(new ProcessThread(ros, i, nextZmwID, seed)); | |
395 } | |
396 | |
397 //Start the threads | |
398 for(ProcessThread pt : alpt){ | |
399 pt.start(); | |
400 } | |
401 | |
402 //Wait for threads to finish | |
403 waitForThreads(alpt); | |
404 | |
405 //Do anything necessary after processing | |
406 | |
407 } | |
408 | |
409 /** Wait until all worker threads are finished, then return */ | |
410 private void waitForThreads(ArrayList<ProcessThread> alpt){ | |
411 | |
412 //Wait for completion of all threads | |
413 boolean success=true; | |
414 for(ProcessThread pt : alpt){ | |
415 | |
416 //Wait until this thread has terminated | |
417 while(pt.getState()!=Thread.State.TERMINATED){ | |
418 try { | |
419 //Attempt a join operation | |
420 pt.join(); | |
421 } catch (InterruptedException e) { | |
422 //Potentially handle this, if it is expected to occur | |
423 e.printStackTrace(); | |
424 } | |
425 } | |
426 | |
427 //Accumulate per-thread statistics | |
428 readsOut+=pt.readsOutT; | |
429 basesOut+=pt.basesOutT; | |
430 Tools.add(idHist, pt.idHistT); | |
431 | |
432 success&=pt.success; | |
433 } | |
434 | |
435 //Track whether any threads failed | |
436 if(!success){errorState=true;} | |
437 } | |
438 | |
439 /*--------------------------------------------------------------*/ | |
440 /*---------------- Inner Methods ----------------*/ | |
441 /*--------------------------------------------------------------*/ | |
442 | |
443 private byte randomBase(Random randy) { | |
444 float rGC=randy.nextFloat(); | |
445 if(rGC>=genomeGC){//AT | |
446 return (byte)(randy.nextBoolean() ? 'A' : 'T'); | |
447 }else{//GC | |
448 return (byte)(randy.nextBoolean() ? 'G' : 'C'); | |
449 } | |
450 } | |
451 | |
452 private static int randomLength(int min, int max, Random randy) { | |
453 if(min==max){return min;} | |
454 int range=max-min+1; | |
455 int x=min+Tools.min(randy.nextInt(range), randy.nextInt(range)); | |
456 // System.err.println(x+", "+min+", "+max); | |
457 // new Exception().printStackTrace(); | |
458 // System.err.println(randy.getClass()); | |
459 // System.err.println(randy.nextLong()); | |
460 return x; | |
461 } | |
462 | |
463 private static float randomRate(float min, float max, Random randy) { | |
464 if(min==max){return min;} | |
465 float range=max-min; | |
466 // float a=(randy.nextFloat()+randy.nextFloat()); | |
467 float b=(randy.nextFloat()+randy.nextFloat()); | |
468 float c=(1.6f*randy.nextFloat()+0.4f*randy.nextFloat()); | |
469 float x=min+range*0.5f*Tools.min(b, c); | |
470 // assert(false) : "x="+x+", a="+a+", b="+b+", c="+c+", min="+min+", max="+max; | |
471 // System.err.println(x+", "+min+", "+max); | |
472 return x; | |
473 } | |
474 | |
475 private byte[] genSynthGenome(Random randy){ | |
476 assert(genomeSize<=MAX_GENOME_LENGTH) : genomeSize; | |
477 byte[] array=new byte[(int)genomeSize]; | |
478 for(int i=0; i<genomeSize; i++){ | |
479 array[i]=randomBase(randy); | |
480 } | |
481 return array; | |
482 } | |
483 | |
484 private byte[] loadData(ConcurrentReadInputStream cris, Random randy){ | |
485 | |
486 ByteBuilder bb=new ByteBuilder(); | |
487 | |
488 //Grab the first ListNum of reads | |
489 ListNum<Read> ln=cris.nextList(); | |
490 | |
491 //As long as there is a nonempty read list... | |
492 while(ln!=null && ln.size()>0){ | |
493 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access | |
494 | |
495 for(Read r : ln){ | |
496 | |
497 // System.err.println("Fetched read len="+r.length());//123 | |
498 | |
499 //Increment counters | |
500 readsProcessed+=r.pairCount(); | |
501 basesProcessed+=r.pairLength(); | |
502 | |
503 | |
504 | |
505 // System.err.println("Filter: addNs="+addNs+", (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength);//123 | |
506 | |
507 //Filter disabled; it causes short sequences to be ignored. | |
508 //Optional filter criteria | |
509 // if(!addNs || (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength))){ | |
510 final byte[] bases=r.bases; | |
511 | |
512 if(addNs){ | |
513 if(bb.length()>0){bb.append('N');} | |
514 }else{ | |
515 for(int i=0; i<bases.length; i++){ | |
516 if(!AminoAcid.isFullyDefined(bases[i])){ | |
517 bases[i]=randomBase(randy); | |
518 } | |
519 } | |
520 } | |
521 | |
522 if(bb.length()+bases.length<=MAX_GENOME_LENGTH){ | |
523 bb.append(bases); | |
524 // System.err.println("Appended "+r.length());//123 | |
525 }else{ | |
526 // System.err.println("Appending partial.");//123 | |
527 for(byte b : bases){ | |
528 bb.append(b); | |
529 if(bb.length>=MAX_GENOME_LENGTH){ | |
530 cris.returnList(ln.id, true); | |
531 // System.err.println("Returning partial "+bb.length);//123 | |
532 return bb.toBytes(); | |
533 } | |
534 } | |
535 } | |
536 // } | |
537 } | |
538 | |
539 //Notify the input stream that the list was used | |
540 cris.returnList(ln); | |
541 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access | |
542 | |
543 //Fetch a new list | |
544 ln=cris.nextList(); | |
545 } | |
546 | |
547 //Notify the input stream that the final list was used | |
548 if(ln!=null){ | |
549 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); | |
550 } | |
551 | |
552 // System.err.println("Returning full "+bb.length);//123 | |
553 return bb.toBytes(); | |
554 } | |
555 | |
556 private void addInvertedRepeats(byte[] bases, Random randy){ | |
557 | |
558 long added=0; | |
559 long toAdd=(long) (bases.length*invertedRepeatRate); | |
560 while(added<toAdd){ | |
561 int len=randomLength(invertedRepeatMinLength, invertedRepeatMaxLength, randy); | |
562 int start=randy.nextInt(bases.length-2*len); | |
563 int stop=start+len; | |
564 boolean OK=true; | |
565 for(int i=0; i<len && OK; i++){ | |
566 OK&=(bases[start+i]!='N' && bases[stop+i]!='N'); | |
567 } | |
568 if(OK){ | |
569 for(int i=0; i<len; i++){ | |
570 byte b=bases[stop-i-1]; | |
571 bases[stop+i]=AminoAcid.baseToComplementExtended[b]; | |
572 } | |
573 added+=2*len; | |
574 // System.err.println("added="+added+"/"+toAdd); | |
575 }else{ | |
576 // | |
577 } | |
578 } | |
579 | |
580 } | |
581 | |
582 /*--------------------------------------------------------------*/ | |
583 /*---------------- Inner Classes ----------------*/ | |
584 /*--------------------------------------------------------------*/ | |
585 | |
586 /** */ | |
587 private class ProcessThread extends Thread { | |
588 | |
589 //Constructor | |
590 ProcessThread(final ConcurrentReadOutputStream ros_, final int tid_, | |
591 final AtomicLong nextZmwID_, final long seed){ | |
592 ros=ros_; | |
593 tid=tid_; | |
594 atomicZmwID=nextZmwID_; | |
595 // assert(false) : randy.getClass()+", "+randy.nextLong(); | |
596 } | |
597 | |
598 //Called by start() | |
599 @Override | |
600 public void run(){ | |
601 //Do anything necessary prior to processing | |
602 randy=Shared.threadLocalRandom(seed<0 ? seed : seed+(tid+1)*tid*1000L); | |
603 | |
604 //Process the reads | |
605 processInner(); | |
606 | |
607 //Do anything necessary after processing | |
608 | |
609 //Indicate successful exit status | |
610 success=true; | |
611 } | |
612 | |
613 /** Iterate through the reads */ | |
614 void processInner(){ | |
615 | |
616 //As long as there is a nonempty read list... | |
617 for(long generated=atomicZmwID.getAndAdd(readsPerList); generated<maxZMWs; | |
618 generated=atomicZmwID.getAndAdd(readsPerList)){ | |
619 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access | |
620 | |
621 long toGenerate=Tools.min(readsPerList, maxZMWs-generated); | |
622 | |
623 ArrayList<Read> reads=generateList((int)toGenerate, generated); | |
624 | |
625 //Output reads to the output stream | |
626 if(ros!=null){ros.add(reads, 0);} | |
627 } | |
628 } | |
629 | |
630 /** Generate the next list of reads */ | |
631 private ArrayList<Read> generateList(int toGenerate, long nextID){ | |
632 | |
633 //Grab the actual read list from the ListNum | |
634 final ArrayList<Read> reads=new ArrayList<Read>(toGenerate); | |
635 | |
636 //Loop through each read in the list | |
637 for(int i=0; i<toGenerate; i++, nextID++){ | |
638 ArrayList<Read> zmw=generateZMW(nextID); | |
639 if(zmw==null){ | |
640 i--; | |
641 nextID--; | |
642 }else{reads.addAll(zmw);} | |
643 } | |
644 | |
645 return reads; | |
646 } | |
647 | |
648 private ReadBuilder median(ArrayList<ReadBuilder> list){ | |
649 if(list.size()<3){return null;} | |
650 IntList lengths=new IntList(list.size()-2); | |
651 | |
652 for(int i=1; i<list.size()-1; i++){ | |
653 lengths.add(list.get(i).length()); | |
654 } | |
655 lengths.sort(); | |
656 int median=lengths.get((lengths.size-1)/2); | |
657 | |
658 for(int i=1; i<list.size()-1; i++){ | |
659 ReadBuilder rb=list.get(i); | |
660 if(rb.length()==median){ | |
661 return rb; | |
662 } | |
663 } | |
664 | |
665 assert(false); | |
666 return null; | |
667 } | |
668 | |
669 /** | |
670 * Generate a single read. | |
671 */ | |
672 private ArrayList<Read> generateZMW(final long nextID){ | |
673 | |
674 final int movieLength=randomLength(minMovie, maxMovie, randy); | |
675 final float errorRate=randomRate(minErrorRate, maxErrorRate, randy); | |
676 | |
677 ArrayList<ReadBuilder> baseCalls=baseCallAllPasses(movieLength, errorRate, nextID); | |
678 if(baseCalls==null){ | |
679 // System.err.println(movieLength+", "+errorRate+", "+nextID);//123 | |
680 return null; | |
681 } | |
682 | |
683 final boolean missing=randy.nextFloat()<missingRate; | |
684 if(missing){ | |
685 final int missingMod=randy.nextInt(2); | |
686 ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>(); | |
687 | |
688 ReadBuilder current=null; | |
689 for(int i=0; i<baseCalls.size(); i++){ | |
690 ReadBuilder rb=baseCalls.get(i); | |
691 assert(rb.subreads==1); | |
692 assert(rb.missing==0); | |
693 assert(rb.adapters==0); | |
694 if(current!=null && (i&1)==missingMod){ | |
695 current.add(rb); | |
696 current.missing++; | |
697 assert(current.subreads>1); | |
698 assert(current.missing>0); | |
699 temp.add(current); | |
700 current=null; | |
701 }else{ | |
702 if(current!=null){temp.add(current);} | |
703 current=rb; | |
704 } | |
705 } | |
706 if(current!=null){temp.add(current);} | |
707 baseCalls=temp; | |
708 } | |
709 | |
710 if(makeCCS){ | |
711 ReadBuilder median=median(baseCalls); | |
712 if(median==null){return null;} | |
713 baseCalls.clear(); | |
714 baseCalls.add(median); | |
715 }else if(hiddenRate>0){ | |
716 ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>(); | |
717 | |
718 ReadBuilder current=null; | |
719 for(int i=0; i<baseCalls.size(); i++){ | |
720 ReadBuilder rb=baseCalls.get(i); | |
721 assert(rb.adapters==0); | |
722 if(current!=null && randy.nextFloat()<hiddenRate){ | |
723 current.add(baseCallAdapter(errorRate)); | |
724 current.add(rb); | |
725 assert(current.adapters>0); | |
726 assert(current.subreads>1); | |
727 }else{ | |
728 if(current!=null){temp.add(current);} | |
729 current=rb; | |
730 assert(current.adapters==0); | |
731 } | |
732 } | |
733 if(current!=null){temp.add(current);} | |
734 baseCalls=temp; | |
735 } | |
736 | |
737 | |
738 ArrayList<Read> reads=new ArrayList<Read>(); | |
739 for(ReadBuilder rb : baseCalls){ | |
740 Read r=rb.toRead(); | |
741 readsOutT+=r.pairCount(); | |
742 basesOutT+=r.length(); | |
743 reads.add(r); | |
744 } | |
745 | |
746 idHistT[(int)((1-errorRate)*(ID_BINS-1)+0.5f)]++; | |
747 return reads; | |
748 } | |
749 | |
750 private ArrayList<ReadBuilder> baseCallAllPasses(final int movieLength, final float errorRate, long zmw){ | |
751 byte[] frag=null; | |
752 | |
753 for(int i=0; i<10 && frag==null; i++){//retry several times | |
754 frag=fetchBases(ref); | |
755 } | |
756 | |
757 if(frag==null){ | |
758 // System.err.println("Failed baseCallAllPasses("+movieLength+", "+errorRate+", "+zmw);//123 | |
759 return null; | |
760 } | |
761 | |
762 ArrayList<ReadBuilder> list=new ArrayList<ReadBuilder>(); | |
763 int movieRemaining=movieLength; | |
764 int moviePos=0; | |
765 assert(frag.length>0) : frag.length; | |
766 int start=randy.nextInt(frag.length); | |
767 while(movieRemaining>0){ | |
768 ReadBuilder rb=baseCallOnePass(frag, errorRate, start, moviePos, movieRemaining, zmw); | |
769 list.add(rb); | |
770 start=0; | |
771 int elapsed=rb.length()+adapterLen; | |
772 moviePos+=elapsed; | |
773 movieRemaining-=elapsed; | |
774 AminoAcid.reverseComplementBasesInPlace(frag); | |
775 } | |
776 return list; | |
777 } | |
778 | |
779 /** Call bases for one pass */ | |
780 private ReadBuilder baseCallOnePass(final byte[] frag, final float errorRate, final int start, final int movieStart, int movieRemaining, long zmw){ | |
781 final float mult=1/(1-errorRate); | |
782 ByteBuilder bb=new ByteBuilder(); | |
783 int fpos=start; | |
784 for(; fpos<frag.length && movieRemaining>0; fpos++){ | |
785 float f=randy.nextFloat(); | |
786 byte b=frag[fpos]; | |
787 if(f>=errorRate){ | |
788 bb.append(b); | |
789 movieRemaining--; | |
790 }else{ | |
791 f=mult*(1-f); | |
792 if(f<insThresh){//ins | |
793 b=AminoAcid.numberToBase[randy.nextInt(4)]; | |
794 bb.append(b); | |
795 fpos--; | |
796 movieRemaining--; | |
797 }else if(f<delThresh){//del | |
798 | |
799 }else{//sub | |
800 int x=AminoAcid.baseToNumber[b]+randy.nextInt(3)+1; | |
801 b=AminoAcid.numberToBase[x&3]; | |
802 bb.append(b); | |
803 movieRemaining--; | |
804 } | |
805 } | |
806 } | |
807 | |
808 float passes=(fpos-start)*1.0f/frag.length; | |
809 ReadBuilder rb=new ReadBuilder(bb, passes, movieStart, zmw); | |
810 rb.errorRate=errorRate; | |
811 return rb; | |
812 } | |
813 | |
814 /** Call bases for an adapter sequence pass */ | |
815 private ReadBuilder baseCallAdapter(final float errorRate){ | |
816 ReadBuilder rb=baseCallOnePass(pacbioAdapter, errorRate, 0, 0, 999999, 0); | |
817 rb.passes=0; | |
818 rb.fullPasses=0; | |
819 rb.subreads=0; | |
820 rb.adapters=1; | |
821 return rb; | |
822 } | |
823 | |
824 private byte[] fetchBases(byte[] source){ | |
825 | |
826 final int len=randomLength(minMoleculeLength, maxMoleculeLength, randy); | |
827 int start=len>=source.length ? 0 : randy.nextInt(source.length-len); | |
828 int stop=Tools.min(start+len, source.length); | |
829 | |
830 // System.err.println("fetchBases(len="+len+", slen="+source.length+", start="+start+", stop="+stop);//123 | |
831 | |
832 for(int i=start; i<stop; i++){ | |
833 if(!AminoAcid.isFullyDefined(source[i])){ | |
834 return null; | |
835 } | |
836 } | |
837 if(stop-start<1){return null;} | |
838 byte[] frag=Arrays.copyOfRange(source, start, stop); | |
839 if(randy.nextBoolean()){AminoAcid.reverseComplementBasesInPlace(frag);} | |
840 return frag; | |
841 } | |
842 | |
843 /** Number of reads retained by this thread */ | |
844 protected long readsOutT=0; | |
845 /** Number of bases retained by this thread */ | |
846 protected long basesOutT=0; | |
847 | |
848 /** True only if this thread has completed successfully */ | |
849 boolean success=false; | |
850 | |
851 private final AtomicLong atomicZmwID; | |
852 private final int readsPerList=Shared.bufferLen(); | |
853 | |
854 /** Random number source */ | |
855 private Random randy; | |
856 | |
857 /** Random number source */ | |
858 private final long[] idHistT=new long[ID_BINS]; | |
859 | |
860 /** Shared output stream */ | |
861 private final ConcurrentReadOutputStream ros; | |
862 /** Thread ID */ | |
863 final int tid; | |
864 } | |
865 | |
866 /*--------------------------------------------------------------*/ | |
867 | |
868 | |
869 | |
870 /*--------------------------------------------------------------*/ | |
871 /*---------------- Fields ----------------*/ | |
872 /*--------------------------------------------------------------*/ | |
873 | |
874 /** Primary input file path */ | |
875 private String in1=null; | |
876 | |
877 /** Primary output file path */ | |
878 private String out1=null; | |
879 | |
880 /** Primary output file path */ | |
881 private String outIdHist=null; | |
882 | |
883 private String qfout1=null; | |
884 | |
885 /** Override input file extension */ | |
886 private String extin=null; | |
887 /** Override output file extension */ | |
888 private String extout=null; | |
889 | |
890 /*--------------------------------------------------------------*/ | |
891 | |
892 /** */ | |
893 private int minMoleculeLength=500; | |
894 /** */ | |
895 private int maxMoleculeLength=10000; | |
896 /** */ | |
897 private int minMovie=500; | |
898 /** */ | |
899 private int maxMovie=40000; | |
900 | |
901 /** */ | |
902 private double missingRate=0.0; | |
903 /** */ | |
904 private double hiddenRate=0.0; | |
905 /** */ | |
906 private boolean allowBothEndsBad=false; | |
907 | |
908 /** */ | |
909 private float genomeGC=0.6f; | |
910 /** */ | |
911 private long genomeSize=10000000; | |
912 /** */ | |
913 private boolean addNs=true; | |
914 | |
915 /** */ | |
916 private double invertedRepeatRate=0.0; | |
917 /** */ | |
918 private int invertedRepeatMinLength=100; | |
919 /** */ | |
920 private int invertedRepeatMaxLength=10000; | |
921 | |
922 /** */ | |
923 private float minErrorRate=0.05f; | |
924 /** */ | |
925 private float maxErrorRate=0.25f; | |
926 /** */ | |
927 private boolean addErrors=true; | |
928 /** One read per ZMW */ | |
929 private boolean makeCCS=false; | |
930 | |
931 /** */ | |
932 private long seed=-1; | |
933 | |
934 private long[] idHist=new long[ID_BINS]; | |
935 | |
936 //These should add to 1 | |
937 private float insFraction=0.40f; | |
938 private float delFraction=0.35f; | |
939 private float subFraction=0.25f; | |
940 | |
941 private final float insThresh; | |
942 private final float delThresh; | |
943 | |
944 /*--------------------------------------------------------------*/ | |
945 | |
946 /** Number of reads processed */ | |
947 protected long readsProcessed=0; | |
948 /** Number of bases processed */ | |
949 protected long basesProcessed=0; | |
950 | |
951 /** Number of reads retained */ | |
952 protected long readsOut=0; | |
953 /** Number of bases retained */ | |
954 protected long basesOut=0; | |
955 | |
956 /** Quit after processing this many INPUT reads */ | |
957 private long maxReads=-1; | |
958 | |
959 /** Quit after generating this many OUTPUT zmws */ | |
960 private long maxZMWs=-1; | |
961 | |
962 /** Reference genome, max 2Gbp */ | |
963 private byte[] ref; | |
964 | |
965 private AtomicLong nextZmwID=new AtomicLong(0); | |
966 | |
967 /*--------------------------------------------------------------*/ | |
968 /*---------------- Final Fields ----------------*/ | |
969 /*--------------------------------------------------------------*/ | |
970 | |
971 /** Primary input file */ | |
972 private final FileFormat ffin1; | |
973 | |
974 /** Primary output file */ | |
975 private final FileFormat ffout1; | |
976 | |
977 /** Primary output file */ | |
978 private final FileFormat ffIdHist; | |
979 | |
980 /*--------------------------------------------------------------*/ | |
981 /*---------------- Constants ----------------*/ | |
982 /*--------------------------------------------------------------*/ | |
983 | |
984 private static final int ID_BINS=201; | |
985 | |
986 private static final long MAX_GENOME_LENGTH=2000000000; | |
987 | |
988 public static final byte[] pacbioAdapter="ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT".getBytes(); | |
989 public static final int adapterLen=pacbioAdapter.length; | |
990 | |
991 /*--------------------------------------------------------------*/ | |
992 /*---------------- Common Fields ----------------*/ | |
993 /*--------------------------------------------------------------*/ | |
994 | |
995 /** Print status messages to this output stream */ | |
996 private PrintStream outstream=System.err; | |
997 /** Print verbose messages */ | |
998 public static boolean verbose=false; | |
999 /** True if an error was encountered */ | |
1000 public boolean errorState=false; | |
1001 /** Overwrite existing output files */ | |
1002 private boolean overwrite=false; | |
1003 /** Append to existing output files */ | |
1004 private boolean append=false; | |
1005 | |
1006 } |