comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/icecream/IceCreamMaker.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package icecream;
2
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Random;
7 import java.util.concurrent.atomic.AtomicLong;
8
9 import dna.AminoAcid;
10 import fileIO.ByteFile;
11 import fileIO.ByteStreamWriter;
12 import fileIO.FileFormat;
13 import fileIO.ReadWrite;
14 import shared.Parse;
15 import shared.Parser;
16 import shared.PreParser;
17 import shared.ReadStats;
18 import shared.Shared;
19 import shared.Timer;
20 import shared.Tools;
21 import stream.ConcurrentReadInputStream;
22 import stream.ConcurrentReadOutputStream;
23 import stream.FASTQ;
24 import stream.FastaReadInputStream;
25 import stream.Read;
26 import stream.SamLine;
27 import structures.ByteBuilder;
28 import structures.IntList;
29 import structures.ListNum;
30
31 /**
32 * Generates chimeric PacBio reads containing inverted repeats
33 * due to missing adapters.
34 *
35 * @author Brian Bushnell
36 * @date June 8, 2019
37 *
38 */
39 public class IceCreamMaker {
40
41 /*--------------------------------------------------------------*/
42 /*---------------- Initialization ----------------*/
43 /*--------------------------------------------------------------*/
44
45 /**
46 * Code entrance from the command line.
47 * @param args Command line arguments
48 */
49 public static void main(String[] args){
50 //Start a timer immediately upon code entrance.
51 Timer t=new Timer();
52
53 //Create an instance of this class
54 IceCreamMaker x=new IceCreamMaker(args);
55
56 //Run the object
57 x.process(t);
58
59 //Close the print stream if it was redirected
60 Shared.closeStream(x.outstream);
61 }
62
63 /**
64 * Constructor.
65 * @param args Command line arguments
66 */
67 public IceCreamMaker(String[] args){
68
69 {//Preparse block for help, config files, and outstream
70 PreParser pp=new PreParser(args, getClass(), false);
71 args=pp.args;
72 outstream=pp.outstream;
73 }
74
75 //Set shared static variables prior to parsing
76 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
77 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
78 FASTQ.TEST_INTERLEAVED=FASTQ.FORCE_INTERLEAVED=false;
79 Shared.FAKE_QUAL=8;
80 // Shared.FASTA_WRAP=511;
81 SamLine.SET_FROM_OK=true;
82
83 {//Parse the arguments
84 final Parser parser=parse(args);
85 Parser.processQuality();
86
87 maxReads=parser.maxReads;
88 overwrite=ReadStats.overwrite=parser.overwrite;
89 append=ReadStats.append=parser.append;
90
91 in1=parser.in1;
92 extin=parser.extin;
93
94 out1=parser.out1;
95 qfout1=parser.qfout1;
96 extout=parser.extout;
97 }
98
99 validateParams();
100 fixExtensions(); //Add or remove .gz or .bz2 as needed
101 checkFileExistence(); //Ensure files can be read and written
102 checkStatics(); //Adjust file-related static fields as needed for this program
103
104 //Create output FileFormat objects
105 ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
106
107 //Create input FileFormat objects
108 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
109
110 ffIdHist=FileFormat.testOutput(outIdHist, FileFormat.TXT, null, false, overwrite, append, false);
111
112 insThresh=insFraction;
113 delThresh=delFraction+insThresh;
114 }
115
116 /*--------------------------------------------------------------*/
117 /*---------------- Initialization Helpers ----------------*/
118 /*--------------------------------------------------------------*/
119
120 /** Parse arguments from the command line */
121 private Parser parse(String[] args){
122
123 //Create a parser object
124 Parser parser=new Parser();
125
126 //Set any necessary Parser defaults here
127 //parser.foo=bar;
128
129 //Parse each argument
130 for(int i=0; i<args.length; i++){
131 String arg=args[i];
132
133 //Break arguments into their constituent parts, in the form of "a=b"
134 String[] split=arg.split("=");
135 String a=split[0].toLowerCase();
136 String b=split.length>1 ? split[1] : null;
137 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
138
139 if(a.equals("verbose")){
140 verbose=Parse.parseBoolean(b);
141 }
142
143 else if(a.equals("idhist")){
144 outIdHist=b;
145 }
146
147 else if(a.equals("minlength") || a.equals("minlen")){
148 minMoleculeLength=Parse.parseIntKMG(b);
149 }else if(a.equals("maxlength") || a.equals("maxlen")){
150 maxMoleculeLength=Parse.parseIntKMG(b);
151 }else if(a.equals("length") || a.equals("len")){
152 minMoleculeLength=maxMoleculeLength=Parse.parseIntKMG(b);
153 }
154
155 else if(a.equals("minmovie") || a.equals("minmov")){
156 minMovie=Parse.parseIntKMG(b);
157 }else if(a.equals("maxmovie") || a.equals("maxmov")){
158 maxMovie=Parse.parseIntKMG(b);
159 }else if(a.equals("movie") || a.equals("mov")){
160 minMovie=maxMovie=Parse.parseIntKMG(b);
161 }
162
163 else if(a.equals("missingrate") || a.equals("missing")){
164 missingRate=Double.parseDouble(b);
165 assert(missingRate<=1);
166 }else if(a.equals("hiddenrate") || a.equals("hidden")){
167 hiddenRate=Double.parseDouble(b);
168 assert(hiddenRate<=1);
169 }else if(a.equals("bothends")){
170 allowBothEndsBad=Parse.parseBoolean(b);
171 assert(false) : "TODO";
172 }
173
174 else if(a.equals("gc")){
175 genomeGC=(float)Double.parseDouble(b);
176 assert(genomeGC>=0 && genomeGC<=1);
177 }else if(a.equals("genomesize")){
178 genomeSize=Parse.parseKMG(b);
179 }else if(a.equals("addns") || a.equals("ns")){
180 addNs=Parse.parseBoolean(b);
181 }else if(a.equals("seed")){
182 seed=Long.parseLong(b);
183 }else if(a.equals("zmws") || a.equals("maxzmws") || a.equals("reads")){
184 maxZMWs=Parse.parseIntKMG(b);
185 }else if(a.equals("ccs")){
186 makeCCS=Parse.parseBoolean(b);
187 }
188
189 else if(a.equals("invertedrepeatrate") || a.equals("invertrepeatrate") || a.equals("irrate")){
190 invertedRepeatRate=Double.parseDouble(b);
191 assert(invertedRepeatRate>=0);
192 }else if(a.equals("invertedrepeatminlen") || a.equals("invertrepeatminlen") || a.equals("irminlen")){
193 invertedRepeatMinLength=Parse.parseIntKMG(b);
194 }else if(a.equals("invertedrepeatmaxlen") || a.equals("invertrepeatmaxlen") || a.equals("irmaxlen")){
195 invertedRepeatMaxLength=Parse.parseIntKMG(b);
196 }else if(a.equals("invertedrepeatlen") || a.equals("invertrepeatlen") || a.equals("irlen")){
197 invertedRepeatMinLength=invertedRepeatMaxLength=Parse.parseIntKMG(b);
198 }
199
200 else if(a.equals("miner") || a.equals("minerrorrate")){
201 minErrorRate=(float)Double.parseDouble(b);
202 assert(minErrorRate>=0 && minErrorRate<=1);
203 }else if(a.equals("maxer") || a.equals("maxerrorrate")){
204 maxErrorRate=(float)Double.parseDouble(b);
205 assert(maxErrorRate>=0 && maxErrorRate<=1);
206 }else if(a.equals("er") || a.equals("errorrate")){
207 minErrorRate=maxErrorRate=(float)Double.parseDouble(b);
208 assert(minErrorRate>=0 && minErrorRate<=1);
209 }
210
211 else if(a.equals("minid") || a.equals("minidentity")){
212 maxErrorRate=1-(float)Double.parseDouble(b);
213 assert(maxErrorRate>=0 && maxErrorRate<=1);
214 }else if(a.equals("maxid") || a.equals("maxidentity")){
215 minErrorRate=1-(float)Double.parseDouble(b);
216 assert(minErrorRate>=0 && minErrorRate<=1);
217 }else if(a.equals("id") || a.equals("identity")){
218 minErrorRate=maxErrorRate=1-(float)Double.parseDouble(b);
219 assert(minErrorRate>=0 && minErrorRate<=1);
220 }else if(a.equals("adderrors")){
221 addErrors=Parse.parseBoolean(b);
222 }else if(a.equals("ref")){
223 parser.in1=b;
224 }
225
226 else if(a.equals("parse_flag_goes_here")){
227 long fake_variable=Parse.parseKMG(b);
228 //Set a variable here
229 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
230 //do nothing
231 }else{
232 outstream.println("Unknown parameter "+args[i]);
233 assert(false) : "Unknown parameter "+args[i];
234 }
235 }
236
237 return parser;
238 }
239
240 /** Add or remove .gz or .bz2 as needed */
241 private void fixExtensions(){
242 in1=Tools.fixExtension(in1);
243 }
244
245 /** Ensure files can be read and written */
246 private void checkFileExistence(){
247 //Ensure output files can be written
248 if(!Tools.testOutputFiles(overwrite, append, false, out1, outIdHist)){
249 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
250 }
251
252 //Ensure input files can be read
253 if(!Tools.testInputFiles(false, true, in1)){
254 throw new RuntimeException("\nCan't read some input files.\n");
255 }
256
257 //Ensure that no file was specified multiple times
258 if(!Tools.testForDuplicateFiles(true, in1, out1, outIdHist)){
259 throw new RuntimeException("\nSome file names were specified multiple times.\n");
260 }
261 }
262
263 /** Adjust file-related static fields as needed for this program */
264 private static void checkStatics(){
265 //Adjust the number of threads for input file reading
266 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
267 ByteFile.FORCE_MODE_BF2=true;
268 }
269
270 assert(FastaReadInputStream.settingsOK());
271 }
272
273 /** Ensure parameter ranges are within bounds and required parameters are set */
274 private boolean validateParams(){
275 assert(minMoleculeLength>0 && minMoleculeLength<=maxMoleculeLength) : minMoleculeLength+", "+maxMoleculeLength;
276 assert(minMovie>0 && minMovie<=maxMovie) : minMovie+", "+maxMovie;
277
278 assert(missingRate>=0 && missingRate<=1) : missingRate;
279 assert(hiddenRate>=0 && hiddenRate<=1) : hiddenRate;
280 assert(genomeGC>=0 && genomeGC<=1) : genomeGC;
281 assert(in1!=null || genomeSize>maxMoleculeLength) : genomeSize;
282 assert(in1!=null || invertedRepeatMaxLength*2<genomeSize) : genomeSize;
283 assert(maxZMWs>0) : "zmsw="+maxZMWs+"; please set to a positive number.";
284
285 assert(invertedRepeatRate>=0) : invertedRepeatRate;
286 assert(invertedRepeatMinLength>0 && invertedRepeatMinLength<=invertedRepeatMaxLength) : invertedRepeatMinLength+", "+invertedRepeatMaxLength;
287
288 assert(minErrorRate>=0 && minErrorRate<=maxErrorRate) : minErrorRate+", "+maxErrorRate;
289 return true;
290 }
291
292 /*--------------------------------------------------------------*/
293 /*---------------- Outer Methods ----------------*/
294 /*--------------------------------------------------------------*/
295
296 /** Create read streams and process all data */
297 void process(Timer t){
298
299 //Turn off read validation in the input threads to increase speed
300 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
301 Read.VALIDATE_IN_CONSTRUCTOR=true;
302
303 //Create a read input stream
304 final ConcurrentReadInputStream cris=makeCris();
305
306 //Optionally create a read output stream
307 final ConcurrentReadOutputStream ros=makeCros(false);
308
309 //Reset counters
310 readsProcessed=readsOut=0;
311 basesProcessed=basesOut=0;
312
313 Random randy=Shared.threadLocalRandom(seed);
314 if(cris==null){
315 ref=genSynthGenome(randy);
316 }else{
317 ref=loadData(cris, randy);
318 }
319
320 if(invertedRepeatRate>0){
321 addInvertedRepeats(ref, randy);
322 }
323
324 //Process the reads in separate threads
325 spawnThreads(cris, ros);
326
327 if(verbose){outstream.println("Finished; closing streams.");}
328
329 //Write anything that was accumulated by ReadStats
330 errorState|=ReadStats.writeAll();
331 //Close the read streams
332 errorState|=ReadWrite.closeStreams(cris, ros);
333
334 writeIdHist();
335
336 //Reset read validation
337 Read.VALIDATE_IN_CONSTRUCTOR=vic;
338
339 //Report timing and results
340 t.stop();
341 outstream.println(Tools.timeReadsBasesProcessed(t, readsOut, basesOut, 8));
342 // outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false));
343
344 //Throw an exception of there was an error in a thread
345 if(errorState){
346 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
347 }
348 }
349
350 private void writeIdHist(){
351 if(ffIdHist==null){return;}
352 ByteStreamWriter bsw=new ByteStreamWriter(ffIdHist);
353 bsw.start();
354 bsw.print("#Identity\tCount\n".getBytes());
355 for(int i=0; i<idHist.length; i++){
356 bsw.print(i*100.0/(ID_BINS-1), 3).print('\t').println(idHist[i]);
357 }
358 errorState|=bsw.poisonAndWait();
359 }
360
361 /** Create a Read Input Stream */
362 private ConcurrentReadInputStream makeCris(){
363 if(ffin1==null){return null;}
364 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
365 cris.start(); //Start the stream
366 if(verbose){outstream.println("Started cris");}
367 return cris;
368 }
369
370 /** Create a Read Output Stream */
371 private ConcurrentReadOutputStream makeCros(boolean pairedInput){
372 if(ffout1==null){return null;}
373
374 //Set output buffer size
375 final int buff=4;
376
377 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(
378 ffout1, null, qfout1, null, buff, null, false);
379 ros.start(); //Start the stream
380 return ros;
381 }
382
383 /** Spawn process threads */
384 private void spawnThreads(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
385
386 //Do anything necessary prior to processing
387
388 //Determine how many threads may be used
389 final int threads=Shared.threads();
390
391 //Fill a list with ProcessThreads
392 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
393 for(int i=0; i<threads; i++){
394 alpt.add(new ProcessThread(ros, i, nextZmwID, seed));
395 }
396
397 //Start the threads
398 for(ProcessThread pt : alpt){
399 pt.start();
400 }
401
402 //Wait for threads to finish
403 waitForThreads(alpt);
404
405 //Do anything necessary after processing
406
407 }
408
409 /** Wait until all worker threads are finished, then return */
410 private void waitForThreads(ArrayList<ProcessThread> alpt){
411
412 //Wait for completion of all threads
413 boolean success=true;
414 for(ProcessThread pt : alpt){
415
416 //Wait until this thread has terminated
417 while(pt.getState()!=Thread.State.TERMINATED){
418 try {
419 //Attempt a join operation
420 pt.join();
421 } catch (InterruptedException e) {
422 //Potentially handle this, if it is expected to occur
423 e.printStackTrace();
424 }
425 }
426
427 //Accumulate per-thread statistics
428 readsOut+=pt.readsOutT;
429 basesOut+=pt.basesOutT;
430 Tools.add(idHist, pt.idHistT);
431
432 success&=pt.success;
433 }
434
435 //Track whether any threads failed
436 if(!success){errorState=true;}
437 }
438
439 /*--------------------------------------------------------------*/
440 /*---------------- Inner Methods ----------------*/
441 /*--------------------------------------------------------------*/
442
443 private byte randomBase(Random randy) {
444 float rGC=randy.nextFloat();
445 if(rGC>=genomeGC){//AT
446 return (byte)(randy.nextBoolean() ? 'A' : 'T');
447 }else{//GC
448 return (byte)(randy.nextBoolean() ? 'G' : 'C');
449 }
450 }
451
452 private static int randomLength(int min, int max, Random randy) {
453 if(min==max){return min;}
454 int range=max-min+1;
455 int x=min+Tools.min(randy.nextInt(range), randy.nextInt(range));
456 // System.err.println(x+", "+min+", "+max);
457 // new Exception().printStackTrace();
458 // System.err.println(randy.getClass());
459 // System.err.println(randy.nextLong());
460 return x;
461 }
462
463 private static float randomRate(float min, float max, Random randy) {
464 if(min==max){return min;}
465 float range=max-min;
466 // float a=(randy.nextFloat()+randy.nextFloat());
467 float b=(randy.nextFloat()+randy.nextFloat());
468 float c=(1.6f*randy.nextFloat()+0.4f*randy.nextFloat());
469 float x=min+range*0.5f*Tools.min(b, c);
470 // assert(false) : "x="+x+", a="+a+", b="+b+", c="+c+", min="+min+", max="+max;
471 // System.err.println(x+", "+min+", "+max);
472 return x;
473 }
474
475 private byte[] genSynthGenome(Random randy){
476 assert(genomeSize<=MAX_GENOME_LENGTH) : genomeSize;
477 byte[] array=new byte[(int)genomeSize];
478 for(int i=0; i<genomeSize; i++){
479 array[i]=randomBase(randy);
480 }
481 return array;
482 }
483
484 private byte[] loadData(ConcurrentReadInputStream cris, Random randy){
485
486 ByteBuilder bb=new ByteBuilder();
487
488 //Grab the first ListNum of reads
489 ListNum<Read> ln=cris.nextList();
490
491 //As long as there is a nonempty read list...
492 while(ln!=null && ln.size()>0){
493 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
494
495 for(Read r : ln){
496
497 // System.err.println("Fetched read len="+r.length());//123
498
499 //Increment counters
500 readsProcessed+=r.pairCount();
501 basesProcessed+=r.pairLength();
502
503
504
505 // System.err.println("Filter: addNs="+addNs+", (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength);//123
506
507 //Filter disabled; it causes short sequences to be ignored.
508 //Optional filter criteria
509 // if(!addNs || (r.length()>maxMoleculeLength && (invertedRepeatRate<=0 || r.length()>2*invertedRepeatMaxLength))){
510 final byte[] bases=r.bases;
511
512 if(addNs){
513 if(bb.length()>0){bb.append('N');}
514 }else{
515 for(int i=0; i<bases.length; i++){
516 if(!AminoAcid.isFullyDefined(bases[i])){
517 bases[i]=randomBase(randy);
518 }
519 }
520 }
521
522 if(bb.length()+bases.length<=MAX_GENOME_LENGTH){
523 bb.append(bases);
524 // System.err.println("Appended "+r.length());//123
525 }else{
526 // System.err.println("Appending partial.");//123
527 for(byte b : bases){
528 bb.append(b);
529 if(bb.length>=MAX_GENOME_LENGTH){
530 cris.returnList(ln.id, true);
531 // System.err.println("Returning partial "+bb.length);//123
532 return bb.toBytes();
533 }
534 }
535 }
536 // }
537 }
538
539 //Notify the input stream that the list was used
540 cris.returnList(ln);
541 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
542
543 //Fetch a new list
544 ln=cris.nextList();
545 }
546
547 //Notify the input stream that the final list was used
548 if(ln!=null){
549 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
550 }
551
552 // System.err.println("Returning full "+bb.length);//123
553 return bb.toBytes();
554 }
555
556 private void addInvertedRepeats(byte[] bases, Random randy){
557
558 long added=0;
559 long toAdd=(long) (bases.length*invertedRepeatRate);
560 while(added<toAdd){
561 int len=randomLength(invertedRepeatMinLength, invertedRepeatMaxLength, randy);
562 int start=randy.nextInt(bases.length-2*len);
563 int stop=start+len;
564 boolean OK=true;
565 for(int i=0; i<len && OK; i++){
566 OK&=(bases[start+i]!='N' && bases[stop+i]!='N');
567 }
568 if(OK){
569 for(int i=0; i<len; i++){
570 byte b=bases[stop-i-1];
571 bases[stop+i]=AminoAcid.baseToComplementExtended[b];
572 }
573 added+=2*len;
574 // System.err.println("added="+added+"/"+toAdd);
575 }else{
576 //
577 }
578 }
579
580 }
581
582 /*--------------------------------------------------------------*/
583 /*---------------- Inner Classes ----------------*/
584 /*--------------------------------------------------------------*/
585
586 /** */
587 private class ProcessThread extends Thread {
588
589 //Constructor
590 ProcessThread(final ConcurrentReadOutputStream ros_, final int tid_,
591 final AtomicLong nextZmwID_, final long seed){
592 ros=ros_;
593 tid=tid_;
594 atomicZmwID=nextZmwID_;
595 // assert(false) : randy.getClass()+", "+randy.nextLong();
596 }
597
598 //Called by start()
599 @Override
600 public void run(){
601 //Do anything necessary prior to processing
602 randy=Shared.threadLocalRandom(seed<0 ? seed : seed+(tid+1)*tid*1000L);
603
604 //Process the reads
605 processInner();
606
607 //Do anything necessary after processing
608
609 //Indicate successful exit status
610 success=true;
611 }
612
613 /** Iterate through the reads */
614 void processInner(){
615
616 //As long as there is a nonempty read list...
617 for(long generated=atomicZmwID.getAndAdd(readsPerList); generated<maxZMWs;
618 generated=atomicZmwID.getAndAdd(readsPerList)){
619 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
620
621 long toGenerate=Tools.min(readsPerList, maxZMWs-generated);
622
623 ArrayList<Read> reads=generateList((int)toGenerate, generated);
624
625 //Output reads to the output stream
626 if(ros!=null){ros.add(reads, 0);}
627 }
628 }
629
630 /** Generate the next list of reads */
631 private ArrayList<Read> generateList(int toGenerate, long nextID){
632
633 //Grab the actual read list from the ListNum
634 final ArrayList<Read> reads=new ArrayList<Read>(toGenerate);
635
636 //Loop through each read in the list
637 for(int i=0; i<toGenerate; i++, nextID++){
638 ArrayList<Read> zmw=generateZMW(nextID);
639 if(zmw==null){
640 i--;
641 nextID--;
642 }else{reads.addAll(zmw);}
643 }
644
645 return reads;
646 }
647
648 private ReadBuilder median(ArrayList<ReadBuilder> list){
649 if(list.size()<3){return null;}
650 IntList lengths=new IntList(list.size()-2);
651
652 for(int i=1; i<list.size()-1; i++){
653 lengths.add(list.get(i).length());
654 }
655 lengths.sort();
656 int median=lengths.get((lengths.size-1)/2);
657
658 for(int i=1; i<list.size()-1; i++){
659 ReadBuilder rb=list.get(i);
660 if(rb.length()==median){
661 return rb;
662 }
663 }
664
665 assert(false);
666 return null;
667 }
668
669 /**
670 * Generate a single read.
671 */
672 private ArrayList<Read> generateZMW(final long nextID){
673
674 final int movieLength=randomLength(minMovie, maxMovie, randy);
675 final float errorRate=randomRate(minErrorRate, maxErrorRate, randy);
676
677 ArrayList<ReadBuilder> baseCalls=baseCallAllPasses(movieLength, errorRate, nextID);
678 if(baseCalls==null){
679 // System.err.println(movieLength+", "+errorRate+", "+nextID);//123
680 return null;
681 }
682
683 final boolean missing=randy.nextFloat()<missingRate;
684 if(missing){
685 final int missingMod=randy.nextInt(2);
686 ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>();
687
688 ReadBuilder current=null;
689 for(int i=0; i<baseCalls.size(); i++){
690 ReadBuilder rb=baseCalls.get(i);
691 assert(rb.subreads==1);
692 assert(rb.missing==0);
693 assert(rb.adapters==0);
694 if(current!=null && (i&1)==missingMod){
695 current.add(rb);
696 current.missing++;
697 assert(current.subreads>1);
698 assert(current.missing>0);
699 temp.add(current);
700 current=null;
701 }else{
702 if(current!=null){temp.add(current);}
703 current=rb;
704 }
705 }
706 if(current!=null){temp.add(current);}
707 baseCalls=temp;
708 }
709
710 if(makeCCS){
711 ReadBuilder median=median(baseCalls);
712 if(median==null){return null;}
713 baseCalls.clear();
714 baseCalls.add(median);
715 }else if(hiddenRate>0){
716 ArrayList<ReadBuilder> temp=new ArrayList<ReadBuilder>();
717
718 ReadBuilder current=null;
719 for(int i=0; i<baseCalls.size(); i++){
720 ReadBuilder rb=baseCalls.get(i);
721 assert(rb.adapters==0);
722 if(current!=null && randy.nextFloat()<hiddenRate){
723 current.add(baseCallAdapter(errorRate));
724 current.add(rb);
725 assert(current.adapters>0);
726 assert(current.subreads>1);
727 }else{
728 if(current!=null){temp.add(current);}
729 current=rb;
730 assert(current.adapters==0);
731 }
732 }
733 if(current!=null){temp.add(current);}
734 baseCalls=temp;
735 }
736
737
738 ArrayList<Read> reads=new ArrayList<Read>();
739 for(ReadBuilder rb : baseCalls){
740 Read r=rb.toRead();
741 readsOutT+=r.pairCount();
742 basesOutT+=r.length();
743 reads.add(r);
744 }
745
746 idHistT[(int)((1-errorRate)*(ID_BINS-1)+0.5f)]++;
747 return reads;
748 }
749
750 private ArrayList<ReadBuilder> baseCallAllPasses(final int movieLength, final float errorRate, long zmw){
751 byte[] frag=null;
752
753 for(int i=0; i<10 && frag==null; i++){//retry several times
754 frag=fetchBases(ref);
755 }
756
757 if(frag==null){
758 // System.err.println("Failed baseCallAllPasses("+movieLength+", "+errorRate+", "+zmw);//123
759 return null;
760 }
761
762 ArrayList<ReadBuilder> list=new ArrayList<ReadBuilder>();
763 int movieRemaining=movieLength;
764 int moviePos=0;
765 assert(frag.length>0) : frag.length;
766 int start=randy.nextInt(frag.length);
767 while(movieRemaining>0){
768 ReadBuilder rb=baseCallOnePass(frag, errorRate, start, moviePos, movieRemaining, zmw);
769 list.add(rb);
770 start=0;
771 int elapsed=rb.length()+adapterLen;
772 moviePos+=elapsed;
773 movieRemaining-=elapsed;
774 AminoAcid.reverseComplementBasesInPlace(frag);
775 }
776 return list;
777 }
778
779 /** Call bases for one pass */
780 private ReadBuilder baseCallOnePass(final byte[] frag, final float errorRate, final int start, final int movieStart, int movieRemaining, long zmw){
781 final float mult=1/(1-errorRate);
782 ByteBuilder bb=new ByteBuilder();
783 int fpos=start;
784 for(; fpos<frag.length && movieRemaining>0; fpos++){
785 float f=randy.nextFloat();
786 byte b=frag[fpos];
787 if(f>=errorRate){
788 bb.append(b);
789 movieRemaining--;
790 }else{
791 f=mult*(1-f);
792 if(f<insThresh){//ins
793 b=AminoAcid.numberToBase[randy.nextInt(4)];
794 bb.append(b);
795 fpos--;
796 movieRemaining--;
797 }else if(f<delThresh){//del
798
799 }else{//sub
800 int x=AminoAcid.baseToNumber[b]+randy.nextInt(3)+1;
801 b=AminoAcid.numberToBase[x&3];
802 bb.append(b);
803 movieRemaining--;
804 }
805 }
806 }
807
808 float passes=(fpos-start)*1.0f/frag.length;
809 ReadBuilder rb=new ReadBuilder(bb, passes, movieStart, zmw);
810 rb.errorRate=errorRate;
811 return rb;
812 }
813
814 /** Call bases for an adapter sequence pass */
815 private ReadBuilder baseCallAdapter(final float errorRate){
816 ReadBuilder rb=baseCallOnePass(pacbioAdapter, errorRate, 0, 0, 999999, 0);
817 rb.passes=0;
818 rb.fullPasses=0;
819 rb.subreads=0;
820 rb.adapters=1;
821 return rb;
822 }
823
824 private byte[] fetchBases(byte[] source){
825
826 final int len=randomLength(minMoleculeLength, maxMoleculeLength, randy);
827 int start=len>=source.length ? 0 : randy.nextInt(source.length-len);
828 int stop=Tools.min(start+len, source.length);
829
830 // System.err.println("fetchBases(len="+len+", slen="+source.length+", start="+start+", stop="+stop);//123
831
832 for(int i=start; i<stop; i++){
833 if(!AminoAcid.isFullyDefined(source[i])){
834 return null;
835 }
836 }
837 if(stop-start<1){return null;}
838 byte[] frag=Arrays.copyOfRange(source, start, stop);
839 if(randy.nextBoolean()){AminoAcid.reverseComplementBasesInPlace(frag);}
840 return frag;
841 }
842
843 /** Number of reads retained by this thread */
844 protected long readsOutT=0;
845 /** Number of bases retained by this thread */
846 protected long basesOutT=0;
847
848 /** True only if this thread has completed successfully */
849 boolean success=false;
850
851 private final AtomicLong atomicZmwID;
852 private final int readsPerList=Shared.bufferLen();
853
854 /** Random number source */
855 private Random randy;
856
857 /** Random number source */
858 private final long[] idHistT=new long[ID_BINS];
859
860 /** Shared output stream */
861 private final ConcurrentReadOutputStream ros;
862 /** Thread ID */
863 final int tid;
864 }
865
866 /*--------------------------------------------------------------*/
867
868
869
870 /*--------------------------------------------------------------*/
871 /*---------------- Fields ----------------*/
872 /*--------------------------------------------------------------*/
873
874 /** Primary input file path */
875 private String in1=null;
876
877 /** Primary output file path */
878 private String out1=null;
879
880 /** Primary output file path */
881 private String outIdHist=null;
882
883 private String qfout1=null;
884
885 /** Override input file extension */
886 private String extin=null;
887 /** Override output file extension */
888 private String extout=null;
889
890 /*--------------------------------------------------------------*/
891
892 /** */
893 private int minMoleculeLength=500;
894 /** */
895 private int maxMoleculeLength=10000;
896 /** */
897 private int minMovie=500;
898 /** */
899 private int maxMovie=40000;
900
901 /** */
902 private double missingRate=0.0;
903 /** */
904 private double hiddenRate=0.0;
905 /** */
906 private boolean allowBothEndsBad=false;
907
908 /** */
909 private float genomeGC=0.6f;
910 /** */
911 private long genomeSize=10000000;
912 /** */
913 private boolean addNs=true;
914
915 /** */
916 private double invertedRepeatRate=0.0;
917 /** */
918 private int invertedRepeatMinLength=100;
919 /** */
920 private int invertedRepeatMaxLength=10000;
921
922 /** */
923 private float minErrorRate=0.05f;
924 /** */
925 private float maxErrorRate=0.25f;
926 /** */
927 private boolean addErrors=true;
928 /** One read per ZMW */
929 private boolean makeCCS=false;
930
931 /** */
932 private long seed=-1;
933
934 private long[] idHist=new long[ID_BINS];
935
936 //These should add to 1
937 private float insFraction=0.40f;
938 private float delFraction=0.35f;
939 private float subFraction=0.25f;
940
941 private final float insThresh;
942 private final float delThresh;
943
944 /*--------------------------------------------------------------*/
945
946 /** Number of reads processed */
947 protected long readsProcessed=0;
948 /** Number of bases processed */
949 protected long basesProcessed=0;
950
951 /** Number of reads retained */
952 protected long readsOut=0;
953 /** Number of bases retained */
954 protected long basesOut=0;
955
956 /** Quit after processing this many INPUT reads */
957 private long maxReads=-1;
958
959 /** Quit after generating this many OUTPUT zmws */
960 private long maxZMWs=-1;
961
962 /** Reference genome, max 2Gbp */
963 private byte[] ref;
964
965 private AtomicLong nextZmwID=new AtomicLong(0);
966
967 /*--------------------------------------------------------------*/
968 /*---------------- Final Fields ----------------*/
969 /*--------------------------------------------------------------*/
970
971 /** Primary input file */
972 private final FileFormat ffin1;
973
974 /** Primary output file */
975 private final FileFormat ffout1;
976
977 /** Primary output file */
978 private final FileFormat ffIdHist;
979
980 /*--------------------------------------------------------------*/
981 /*---------------- Constants ----------------*/
982 /*--------------------------------------------------------------*/
983
984 private static final int ID_BINS=201;
985
986 private static final long MAX_GENOME_LENGTH=2000000000;
987
988 public static final byte[] pacbioAdapter="ATCTCTCTCAACAACAACAACGGAGGAGGAGGAAAAGAGAGAGAT".getBytes();
989 public static final int adapterLen=pacbioAdapter.length;
990
991 /*--------------------------------------------------------------*/
992 /*---------------- Common Fields ----------------*/
993 /*--------------------------------------------------------------*/
994
995 /** Print status messages to this output stream */
996 private PrintStream outstream=System.err;
997 /** Print verbose messages */
998 public static boolean verbose=false;
999 /** True if an error was encountered */
1000 public boolean errorState=false;
1001 /** Overwrite existing output files */
1002 private boolean overwrite=false;
1003 /** Append to existing output files */
1004 private boolean append=false;
1005
1006 }