comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/CallGenes.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package prok;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7
8 import dna.AminoAcid;
9 import dna.Data;
10 import fileIO.ByteFile;
11 import fileIO.ByteStreamWriter;
12 import fileIO.FileFormat;
13 import fileIO.ReadWrite;
14 import gff.CompareGff;
15 import jgi.BBMerge;
16 import json.JsonObject;
17 import shared.Parse;
18 import shared.Parser;
19 import shared.PreParser;
20 import shared.ReadStats;
21 import shared.Shared;
22 import shared.Timer;
23 import shared.Tools;
24 import stream.ConcurrentReadInputStream;
25 import stream.ConcurrentReadOutputStream;
26 import stream.Read;
27 import structures.ByteBuilder;
28 import structures.ListNum;
29
30 /**
31 * This is the executable class for gene-calling.
32 * @author Brian Bushnell
33 * @date Sep 24, 2018
34 *
35 */
36 public class CallGenes extends ProkObject {
37
38 /*--------------------------------------------------------------*/
39 /*---------------- Initialization ----------------*/
40 /*--------------------------------------------------------------*/
41
42 /**
43 * Code entrance from the command line.
44 * @param args Command line arguments
45 */
46 public static void main(String[] args){
47 //Start a timer immediately upon code entrance.
48 Timer t=new Timer();
49
50 //Create an instance of this class
51 CallGenes x=new CallGenes(args);
52
53 //Run the object
54 x.process(t);
55
56 //Close the print stream if it was redirected
57 Shared.closeStream(x.outstream);
58 }
59
60 /**
61 * Constructor.
62 * @param args Command line arguments
63 */
64 public CallGenes(String[] args){
65
66 {//Preparse block for help, config files, and outstream
67 PreParser pp=new PreParser(args, (args.length>40 ? null : getClass()), false);
68 args=pp.args;
69 outstream=pp.outstream;
70 }
71
72 //Set shared static variables prior to parsing
73 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
74 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
75
76 {//Parse the arguments
77 final Parser parser=parse(args);
78 overwrite=parser.overwrite;
79 append=parser.append;
80
81 outGff=parser.out1;
82 maxReads=parser.maxReads;
83 }
84
85 fixExtensions(); //Add or remove .gz or .bz2 as needed
86 checkFileExistence(); //Ensure files can be read and written
87 checkStatics(); //Adjust file-related static fields as needed for this program
88
89 ffoutGff=FileFormat.testOutput(outGff, FileFormat.GFF, null, true, overwrite, append, ordered);
90 ffoutAmino=FileFormat.testOutput(outAmino, FileFormat.FA, null, true, overwrite, append, ordered);
91 ffout16S=FileFormat.testOutput(out16S, FileFormat.FA, null, true, overwrite, append, ordered);
92 ffout18S=FileFormat.testOutput(out18S, FileFormat.FA, null, true, overwrite, append, ordered);
93
94 if(ffoutGff!=null){
95 assert(!ffoutGff.isSequence()) : "\nout is for gff files. To output sequence, please use outa.";
96 }
97 if(ffoutAmino!=null){
98 assert(!ffoutAmino.gff()) : "\nouta is for sequence data. To output gff, please use out.";
99 }
100 if(ffout16S!=null){
101 assert(!ffout16S.gff()) : "\nout16S is for sequence data. To output gff, please use out.";
102 }
103 if(ffout18S!=null){
104 assert(!ffout18S.gff()) : "\nout18S is for sequence data. To output gff, please use out.";
105 }
106
107 if(geneHistFile==null){geneHistBins=0;}
108 else{
109 assert(geneHistBins>1) : "geneHistBins="+geneHistBins+"; should be >1";
110 assert(geneHistDiv>=1) : "geneHistDiv="+geneHistDiv+"; should be >=1";
111 }
112 geneHist=geneHistBins>1 ? new long[geneHistBins] : null;
113 }
114
115 /*--------------------------------------------------------------*/
116 /*---------------- Initialization Helpers ----------------*/
117 /*--------------------------------------------------------------*/
118
119 /** Parse arguments from the command line */
120 private Parser parse(String[] args){
121
122 Parser parser=new Parser();
123 for(int i=0; i<args.length; i++){
124 String arg=args[i];
125 String[] split=arg.split("=");
126 String a=split[0].toLowerCase();
127 String b=split.length>1 ? split[1] : null;
128 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
129
130 // outstream.println(arg+", "+a+", "+b);
131 if(PGMTools.parseStatic(arg, a, b)){
132 //do nothing
133 }else if(a.equals("in") || a.equals("infna") || a.equals("fnain") || a.equals("fna")){
134 assert(b!=null);
135 Tools.addFiles(b, fnaList);
136 }else if(b==null && new File(arg).exists() && FileFormat.isFastaFile(arg)){
137 fnaList.add(arg);
138 }else if(a.equals("pgm") || a.equals("gm") || a.equals("model")){
139 assert(b!=null);
140 if(b.equalsIgnoreCase("auto") || b.equalsIgnoreCase("default")){
141 b=Data.findPath("?model.pgm");
142 pgmList.add(b);
143 }else{
144 Tools.addFiles(b, pgmList);
145 }
146 }else if(b==null && new File(arg).exists() && FileFormat.isPgmFile(arg)){
147 Tools.addFiles(arg, pgmList);
148 }else if(a.equals("outamino") || a.equals("aminoout") || a.equals("outa") || a.equals("outaa") || a.equals("aaout") || a.equals("amino")){
149 outAmino=b;
150 }else if(a.equalsIgnoreCase("out16s") || a.equalsIgnoreCase("16sout")){
151 out16S=b;
152 }else if(a.equalsIgnoreCase("out18s") || a.equalsIgnoreCase("18sout")){
153 out18S=b;
154 }else if(a.equals("verbose")){
155 verbose=Parse.parseBoolean(b);
156 //ReadWrite.verbose=verbose;
157 GeneCaller.verbose=verbose;
158 }
159
160 else if(a.equals("json_out") || a.equalsIgnoreCase("json")){
161 json_out=Parse.parseBoolean(b);
162 }else if(a.equals("stats") || a.equalsIgnoreCase("outstats")){
163 outStats=b;
164 }else if(a.equals("hist") || a.equalsIgnoreCase("outhist") || a.equalsIgnoreCase("lengthhist") || a.equalsIgnoreCase("lhist") || a.equalsIgnoreCase("genehist")){
165 geneHistFile=b;
166 }else if(a.equals("bins")){
167 geneHistBins=Integer.parseInt(b);
168 }else if(a.equals("binlen") || a.equals("binlength") || a.equals("histdiv")){
169 geneHistBins=Integer.parseInt(b);
170 }else if(a.equals("printzero") || a.equals("pz")){
171 printZeroCountHist=Parse.parseBoolean(b);
172 }
173
174 else if(a.equals("merge")){
175 merge=Parse.parseBoolean(b);
176 }else if(a.equals("ecco")){
177 ecco=Parse.parseBoolean(b);
178 }
179
180 else if(a.equalsIgnoreCase("setbias16s")) {
181 GeneCaller.biases[r16S]=Float.parseFloat(b);
182 }else if(a.equalsIgnoreCase("setbias18s")) {
183 GeneCaller.biases[r18S]=Float.parseFloat(b);
184 }else if(a.equalsIgnoreCase("setbias23s")) {
185 GeneCaller.biases[r23S]=Float.parseFloat(b);
186 }else if(a.equalsIgnoreCase("setbias5s")) {
187 GeneCaller.biases[r5S]=Float.parseFloat(b);
188 }else if(a.equalsIgnoreCase("setbiastRNA")) {
189 GeneCaller.biases[tRNA]=Float.parseFloat(b);
190 }else if(a.equalsIgnoreCase("setbiasCDS")) {
191 GeneCaller.biases[CDS]=Float.parseFloat(b);
192 }
193
194 else if(a.equals("ordered")){
195 ordered=Parse.parseBoolean(b);
196 }
197
198 else if(a.equals("translate")){
199 mode=TRANSLATE;
200 }else if(a.equals("retranslate") || a.equals("detranslate")){
201 mode=RETRANSLATE;
202 }else if(a.equals("recode")){
203 mode=RECODE;
204 }
205
206 else if(a.equalsIgnoreCase("minlen") || a.equals("minlength")){
207 minLen=Integer.parseInt(b);
208 }else if(a.equals("maxoverlapss") || a.equals("overlapss") || a.equals("overlapsamestrand") || a.equals("moss") || a.equalsIgnoreCase("maxOverlapSameStrand")){
209 maxOverlapSameStrand=Integer.parseInt(b);
210 }else if(a.equals("maxoverlapos") || a.equals("overlapos") || a.equals("overlapoppositestrand") || a.equals("moos") || a.equalsIgnoreCase("maxOverlapOppositeStrand")){
211 maxOverlapOppositeStrand=Integer.parseInt(b);
212 }else if(a.equalsIgnoreCase("minStartScore")){
213 minStartScore=Float.parseFloat(b);
214 }else if(a.equalsIgnoreCase("minStopScore")){
215 minStopScore=Float.parseFloat(b);
216 }else if(a.equalsIgnoreCase("minInnerScore") || a.equalsIgnoreCase("minKmerScore")){
217 minKmerScore=Float.parseFloat(b);
218 }else if(a.equalsIgnoreCase("minOrfScore") || a.equalsIgnoreCase("minScore")){
219 minOrfScore=Float.parseFloat(b);
220 }else if(a.equalsIgnoreCase("minAvgScore")){
221 minAvgScore=Float.parseFloat(b);
222 }else if(a.equalsIgnoreCase("breakLimit")){
223 GeneCaller.breakLimit=Integer.parseInt(b);
224 }else if(a.equalsIgnoreCase("clearcutoffs") || a.equalsIgnoreCase("clearfilters")){
225 GeneCaller.breakLimit=9999;
226 minOrfScore=-9999;
227 minAvgScore=-9999;
228 minKmerScore=-9999;
229 minStopScore=-9999;
230 minStartScore=-9999;
231 }
232
233 else if(a.equalsIgnoreCase("e1")){
234 Orf.e1=Float.parseFloat(b);
235 }else if(a.equalsIgnoreCase("e2")){
236 Orf.e2=Float.parseFloat(b);
237 }else if(a.equalsIgnoreCase("e3")){
238 Orf.e3=Float.parseFloat(b);
239 }
240 else if(a.equalsIgnoreCase("f1")){
241 Orf.f1=Float.parseFloat(b);
242 }else if(a.equalsIgnoreCase("f2")){
243 Orf.f2=Float.parseFloat(b);
244 }else if(a.equalsIgnoreCase("f3")){
245 Orf.f3=Float.parseFloat(b);
246 }
247 else if(a.equalsIgnoreCase("p0")){
248 GeneCaller.p0=Float.parseFloat(b);
249 }else if(a.equalsIgnoreCase("p1")){
250 GeneCaller.p1=Float.parseFloat(b);
251 }else if(a.equalsIgnoreCase("p2")){
252 GeneCaller.p2=Float.parseFloat(b);
253 }else if(a.equalsIgnoreCase("p3")){
254 GeneCaller.p3=Float.parseFloat(b);
255 }else if(a.equalsIgnoreCase("p4")){
256 GeneCaller.p4=Float.parseFloat(b);
257 }else if(a.equalsIgnoreCase("p5")){
258 GeneCaller.p5=Float.parseFloat(b);
259 }else if(a.equalsIgnoreCase("p6")){
260 GeneCaller.p6=Float.parseFloat(b);
261 }
262 else if(a.equalsIgnoreCase("q1")){
263 GeneCaller.q1=Float.parseFloat(b);
264 }else if(a.equalsIgnoreCase("q2")){
265 GeneCaller.q2=Float.parseFloat(b);
266 }else if(a.equalsIgnoreCase("q3")){
267 GeneCaller.q3=Float.parseFloat(b);
268 }else if(a.equalsIgnoreCase("q4")){
269 GeneCaller.q4=Float.parseFloat(b);
270 }else if(a.equalsIgnoreCase("q5")){
271 GeneCaller.q5=Float.parseFloat(b);
272 }
273 else if(a.equalsIgnoreCase("lookback")){
274 GeneCaller.lookbackPlus=GeneCaller.lookbackMinus=Integer.parseInt(b);
275 }else if(a.equalsIgnoreCase("lookbackplus")){
276 GeneCaller.lookbackPlus=Integer.parseInt(b);
277 }else if(a.equalsIgnoreCase("lookbackminus")){
278 GeneCaller.lookbackMinus=Integer.parseInt(b);
279 }
280
281 else if(a.equalsIgnoreCase("compareto")){
282 compareToGff=b;
283 }
284
285 else if(ProkObject.parse(arg, a, b)){}
286
287 else if(parser.parse(arg, a, b)){
288 //do nothing
289 }else{
290 outstream.println("Unknown parameter "+args[i]);
291 assert(false) : "Unknown parameter "+args[i];
292 // throw new RuntimeException("Unknown parameter "+args[i]);
293 }
294 }
295
296 if(pgmList.isEmpty()){
297 String b=Data.findPath("?model.pgm");
298 pgmList.add(b);
299 }
300 for(int i=0; i<pgmList.size(); i++){
301 String s=pgmList.get(i);
302 if(s.equalsIgnoreCase("auto") || s.equalsIgnoreCase("default")){
303 pgmList.set(i, Data.findPath("?model.pgm"));
304 }
305 }
306
307 if(Shared.threads()<2){ordered=false;}
308 assert(!fnaList.isEmpty()) : "At least 1 fasta file is required.";
309 assert(!pgmList.isEmpty()) : "At least 1 pgm file is required.";
310 return parser;
311 }
312
313 /** Add or remove .gz or .bz2 as needed */
314 private void fixExtensions(){
315 fnaList=Tools.fixExtension(fnaList);
316 pgmList=Tools.fixExtension(pgmList);
317 if(fnaList.isEmpty()){throw new RuntimeException("Error - at least one input file is required.");}
318 }
319
320 /** Ensure files can be read and written */
321 private void checkFileExistence(){
322 //Ensure output files can be written
323 if(!Tools.testOutputFiles(overwrite, append, false, outGff, outAmino, out16S, out18S, outStats, geneHistFile)){
324 outstream.println((outGff==null)+", "+outGff);
325 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "
326 +outGff+", "+outAmino+", "+out16S+", "+out18S+", "+outStats+", "+geneHistFile+"\n");
327 }
328
329 //Ensure input files can be read
330 ArrayList<String> foo=new ArrayList<String>();
331 foo.addAll(fnaList);
332 foo.addAll(pgmList);
333 if(!Tools.testInputFiles(false, true, foo.toArray(new String[0]))){
334 throw new RuntimeException("\nCan't read some input files.\n");
335 }
336
337 //Ensure that no file was specified multiple times
338 foo.add(outGff);
339 foo.add(outAmino);
340 foo.add(out16S);
341 foo.add(out18S);
342 foo.add(outStats);
343 foo.add(geneHistFile);
344 if(!Tools.testForDuplicateFiles(true, foo.toArray(new String[0]))){
345 throw new RuntimeException("\nSome file names were specified multiple times.\n");
346 }
347 }
348
349 /** Adjust file-related static fields as needed for this program */
350 private static void checkStatics(){
351 //Adjust the number of threads for input file reading
352 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
353 ByteFile.FORCE_MODE_BF2=true;
354 }
355 }
356
357 /*--------------------------------------------------------------*/
358 /*---------------- Outer Methods ----------------*/
359 /*--------------------------------------------------------------*/
360
361 /** Create read streams and process all data */
362 void process(Timer t){
363
364 GeneModel pgm=PGMTools.loadAndMerge(pgmList);
365
366 if(call16S || call18S || call23S || calltRNA || call5S){
367 loadLongKmers();
368 loadConsensusSequenceFromFile(false, false);
369 }
370
371 ByteStreamWriter bsw=makeBSW(ffoutGff);
372 if(bsw!=null){
373 bsw.forcePrint("##gff-version 3\n");
374 }
375
376 ConcurrentReadOutputStream rosAmino=makeCros(ffoutAmino);
377 ConcurrentReadOutputStream ros16S=makeCros(ffout16S);
378 ConcurrentReadOutputStream ros18S=makeCros(ffout18S);
379
380 //Turn off read validation in the input threads to increase speed
381 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR;
382 Read.VALIDATE_IN_CONSTRUCTOR=Shared.threads()<4;
383
384 //Reset counters
385 readsIn=genesOut=0;
386 basesIn=bytesOut=0;
387
388 for(String fname : fnaList){
389 //Create a read input stream
390 final ConcurrentReadInputStream cris=makeCris(fname);
391
392 //Process the reads in separate threads
393 spawnThreads(cris, bsw, rosAmino, ros16S, ros18S, pgm);
394
395 //Close the input stream
396 errorState|=ReadWrite.closeStream(cris);
397 }
398
399 //Close the input stream
400 errorState|=ReadWrite.closeStreams(null, rosAmino, ros16S, ros18S);
401
402 if(verbose){outstream.println("Finished; closing streams.");}
403
404 //Write anything that was accumulated by ReadStats
405 errorState|=ReadStats.writeAll();
406 //Close the output stream
407 if(bsw!=null){errorState|=bsw.poisonAndWait();}
408
409 //Reset read validation
410 Read.VALIDATE_IN_CONSTRUCTOR=vic;
411
412 //Report timing and results
413 t.stop();
414 outstream.println(Tools.timeReadsBasesProcessed(t, readsIn, basesIn, 8));
415 outstream.println(Tools.linesBytesOut(readsIn, basesIn, genesOut, bytesOut, 8, false));
416 outstream.println();
417
418 if(json_out){
419 printStatsJson(outStats);
420 }else{
421 printStats(outStats);
422 }
423
424 if(geneHistFile!=null){
425 printHist(geneHistFile);
426 }
427
428 //Throw an exception of there was an error in a thread
429 if(errorState){
430 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
431 }
432
433 if(compareToGff!=null){
434 if(compareToGff.equals("auto")){
435 compareToGff=fnaList.get(0).replace(".fna", ".gff");
436 compareToGff=compareToGff.replace(".fa", ".gff");
437 compareToGff=compareToGff.replace(".fasta", ".gff");
438 }
439 CompareGff.main(new String[] {outGff, compareToGff});
440 }
441 }
442
443 private void printStats(String fname){
444 if(fname==null){return;}
445 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, false);
446 bsw.start();
447
448 if(callCDS){
449 bsw.println("Gene Starts Made: \t "+Tools.padLeft(geneStartsMade, 12));
450 bsw.println("Gene Stops Made: \t "+Tools.padLeft(geneStopsMade, 12));
451 bsw.println("Gene Starts Retained: \t "+Tools.padLeft(geneStartsRetained, 12));
452 bsw.println("Gene Stops Retained: \t "+Tools.padLeft(geneStopsRetained, 12));
453 bsw.println("CDS Out: \t "+Tools.padLeft(geneStartsOut, 12));
454 }
455 if(call16S){bsw.println("16S Out: \t "+Tools.padLeft(r16SOut, 12));}
456 if(call18S){bsw.println("18S Out: \t "+Tools.padLeft(r18SOut, 12));}
457 if(call23S){bsw.println("23S Out: \t "+Tools.padLeft(r23SOut, 12));}
458 if(call5S){bsw.println("5S Out: \t "+Tools.padLeft(r5SOut, 12));}
459 if(calltRNA){bsw.println("tRNA Out: \t "+Tools.padLeft(tRNAOut, 12));}
460
461 if(callCDS){
462 bsw.println();
463 bsw.println("All ORF Stats:");
464 bsw.print(stCds.toString());
465
466 bsw.println();
467 bsw.println("Retained ORF Stats:");
468 bsw.print(stCds2.toString());
469
470 bsw.println();
471 bsw.println("Called ORF Stats:");
472 stCdsPass.genomeSize=basesIn;
473 bsw.print(stCdsPass.toString());
474 }
475
476 if(call16S){
477 bsw.println();
478 bsw.println("Called 16S Stats:");
479 bsw.print(st16s.toString());
480 }
481 if(call23S){
482 bsw.println();
483 bsw.println("Called 23S Stats:");
484 bsw.print(st23s.toString());
485 }
486 if(call5S){
487 bsw.println();
488 bsw.println("Called 5S Stats:");
489 bsw.print(st5s.toString());
490 }
491 if(call18S){
492 bsw.println();
493 bsw.println("Called 18S Stats:");
494 bsw.print(st18s.toString());
495 }
496 bsw.poisonAndWait();
497 }
498
499 private void printStatsJson(String fname){
500 if(fname==null){return;}
501
502 JsonObject outer=new JsonObject();
503
504 {
505 JsonObject jo=new JsonObject();
506 if(callCDS){
507 jo.add("Gene Starts Made", geneStartsMade);
508 jo.add("Gene Stops Made", geneStopsMade);
509 jo.add("Gene Starts Retained", geneStartsRetained);
510 jo.add("Gene Stops Retained", geneStopsRetained);
511 jo.add("CDS Out", geneStartsOut);
512 }
513 if(call16S){jo.add("16S Out", r16SOut);}
514 if(call18S){jo.add("18S Out", r18SOut);}
515 if(call23S){jo.add("23S Out", r23SOut);}
516 if(call5S){jo.add("5S Out", r5SOut);}
517 if(calltRNA){jo.add("tRNA Out", tRNAOut);}
518 outer.add("Overall", jo);
519 }
520
521 if(callCDS){
522 outer.add("All ORF Stats", stCds.toJson());
523 outer.add("Retained ORF Stats", stCds2.toJson());
524 stCdsPass.genomeSize=basesIn;
525 outer.add("Called ORF Stats", stCdsPass.toJson());
526 }
527
528 if(call16S){outer.add("Called 16S Stats", st16s.toJson());}
529 if(call18S){outer.add("Called 18S Stats", st18s.toJson());}
530 if(call23S){outer.add("Called 23S Stats", st23s.toJson());}
531 if(call5S){outer.add("Called 5S Stats", st5s.toJson());}
532 if(calltRNA){outer.add("Called tRNA Stats", sttRNA.toJson());}
533
534
535 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, false);
536 bsw.start();
537 bsw.println(outer.toText());
538 bsw.poisonAndWait();
539 }
540
541 private void printHist(String fname){
542 if(fname==null || geneHist==null){return;}
543 ByteStreamWriter bsw=new ByteStreamWriter(fname, overwrite, append, false);
544 bsw.start();
545 long sum=Tools.sum(geneHist);
546 double mean=Tools.averageHistogram(geneHist)*geneHistDiv;
547 int median=Tools.medianHistogram(geneHist)*geneHistDiv;
548 double std=Tools.standardDeviationHistogram(geneHist)*geneHistDiv;
549 bsw.println("#Gene Length Histogram");
550 bsw.print("#Genes:\t").println(sum);
551 bsw.print("#Mean:\t").println(mean, 4);
552 bsw.print("#Median:\t").println(median);
553 bsw.print("#STDDev:\t").println(std, 4);
554 bsw.print("#Length\tCount\n");
555 long cum=0;
556 for(int i=0; i<geneHist.length && cum<sum; i++){
557 int len=i*geneHistDiv;
558 long count=geneHist[i];
559 cum+=count;
560 if(count>0 || printZeroCountHist){
561 bsw.print(len).tab().println(count);
562 }
563 }
564 bsw.poisonAndWait();
565 }
566
567 private ConcurrentReadInputStream makeCris(String fname){
568 FileFormat ffin=FileFormat.testInput(fname, FileFormat.FA, null, true, true);
569 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, false, ffin, null);
570 cris.start(); //Start the stream
571 if(verbose){outstream.println("Started cris");}
572 return cris;
573 }
574
575 /** Spawn process threads */
576 private void spawnThreads(final ConcurrentReadInputStream cris, final ByteStreamWriter bsw,
577 ConcurrentReadOutputStream rosAmino, ConcurrentReadOutputStream ros16S, ConcurrentReadOutputStream ros18S, GeneModel pgm){
578
579 //Do anything necessary prior to processing
580
581 //Determine how many threads may be used
582 final int threads=Shared.threads();
583
584 //Fill a list with ProcessThreads
585 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
586 for(int i=0; i<threads; i++){
587 alpt.add(new ProcessThread(cris, bsw, rosAmino, ros16S, ros18S, pgm, minLen, i));
588 }
589
590 //Start the threads
591 for(ProcessThread pt : alpt){
592 pt.start();
593 }
594
595 //Wait for threads to finish
596 waitForThreads(alpt);
597
598 //Do anything necessary after processing
599
600 }
601
602 private void waitForThreads(ArrayList<ProcessThread> alpt){
603
604 //Wait for completion of all threads
605 boolean success=true;
606 for(ProcessThread pt : alpt){
607
608 //Wait until this thread has terminated
609 while(pt.getState()!=Thread.State.TERMINATED){
610 try {
611 //Attempt a join operation
612 pt.join();
613 } catch (InterruptedException e) {
614 //Potentially handle this, if it is expected to occur
615 e.printStackTrace();
616 }
617 }
618
619 //Accumulate per-thread statistics
620 readsIn+=pt.readsInT;
621 basesIn+=pt.basesInT;
622 genesOut+=pt.genesOutT;
623 bytesOut+=pt.bytesOutT;
624
625 geneStopsMade+=pt.caller.geneStopsMade;
626 geneStartsMade+=pt.caller.geneStartsMade;
627 geneStartsRetained+=pt.caller.geneStartsRetained;
628 geneStopsRetained+=pt.caller.geneStopsRetained;
629 geneStartsOut+=pt.caller.geneStartsOut;
630
631 r16SOut+=pt.caller.r16SOut;
632 r18SOut+=pt.caller.r18SOut;
633 r23SOut+=pt.caller.r23SOut;
634 r5SOut+=pt.caller.r5SOut;
635 tRNAOut+=pt.caller.tRNAOut;
636
637 stCds.add(pt.caller.stCds);
638 stCds2.add(pt.caller.stCds2);
639 // stCdsPass.add(pt.caller.stCdsPass);
640
641 for(int i=0; i<trackers.length; i++){
642 trackers[i].add(pt.caller.trackers[i]);
643 }
644
645 if(geneHist!=null){Tools.add(geneHist, pt.geneHistT);}
646
647 success&=pt.success;
648 }
649
650 //Track whether any threads failed
651 if(!success){errorState=true;}
652 }
653
654 /*--------------------------------------------------------------*/
655 /*---------------- Inner Methods ----------------*/
656 /*--------------------------------------------------------------*/
657
658 private static ByteStreamWriter makeBSW(FileFormat ff){
659 if(ff==null){return null;}
660 ByteStreamWriter bsw=new ByteStreamWriter(ff);
661 bsw.start();
662 return bsw;
663 }
664
665 private ConcurrentReadOutputStream makeCros(FileFormat ff){
666 if(ff==null){return null;}
667
668 //Select output buffer size based on whether it needs to be ordered
669 final int buff=(ordered ? Tools.mid(4, 64, (Shared.threads()*2)/3) : 4);
670
671 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ff, null, buff, null, false);
672 ros.start(); //Start the stream
673 return ros;
674 }
675
676 /*--------------------------------------------------------------*/
677 /*---------------- Inner Classes ----------------*/
678 /*--------------------------------------------------------------*/
679
680 /** This class is static to prevent accidental writing to shared variables.
681 * It is safe to remove the static modifier. */
682 private class ProcessThread extends Thread {
683
684 //Constructor
685 ProcessThread(final ConcurrentReadInputStream cris_, final ByteStreamWriter bsw_,
686 ConcurrentReadOutputStream rosAmino_, ConcurrentReadOutputStream ros16S_, ConcurrentReadOutputStream ros18S_,
687 GeneModel pgm_, final int minLen, final int tid_){
688 cris=cris_;
689 bsw=bsw_;
690 rosAmino=rosAmino_;
691 ros16S=ros16S_;
692 ros18S=ros18S_;
693 pgm=pgm_;
694 tid=tid_;
695 geneHistT=(geneHistBins>1 ? new long[geneHistBins] : null);
696 caller=new GeneCaller(minLen, maxOverlapSameStrand, maxOverlapOppositeStrand,
697 minStartScore, minStopScore, minKmerScore, minOrfScore, minAvgScore, pgm);
698 }
699
700 //Called by start()
701 @Override
702 public void run(){
703 //Do anything necessary prior to processing
704
705 //Process the reads
706 processInner();
707
708 //Do anything necessary after processing
709
710 //Indicate successful exit status
711 success=true;
712 }
713
714 /** Iterate through the reads */
715 void processInner(){
716
717 //Grab the first ListNum of reads
718 ListNum<Read> ln=cris.nextList();
719
720 //Check to ensure pairing is as expected
721 if(ln!=null && !ln.isEmpty()){
722 Read r=ln.get(0);
723 // assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); //Disabled due to non-static access
724 }
725
726 //As long as there is a nonempty read list...
727 while(ln!=null && ln.size()>0){
728 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
729
730 processList(ln);
731
732 //Fetch a new list
733 ln=cris.nextList();
734 }
735
736 //Notify the input stream that the final list was used
737 if(ln!=null){
738 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
739 }
740 }
741
742 void processList(ListNum<Read> ln){
743 //Grab the actual read list from the ListNum
744 final ArrayList<Read> reads=ln.list;
745
746 // System.err.println(reads.size());
747
748 ArrayList<Orf> orfList=new ArrayList<Orf>();
749
750 //Loop through each read in the list
751 for(int idx=0; idx<reads.size(); idx++){
752 Read r1=reads.get(idx);
753 Read r2=r1.mate;
754
755 //Validate reads in worker threads
756 if(!r1.validated()){r1.validate(true);}
757 if(r2!=null && !r2.validated()){r2.validate(true);}
758
759 //Track the initial length for statistics
760 final int initialLength1=r1.length();
761 final int initialLength2=r1.mateLength();
762
763 //Increment counters
764 readsInT+=r1.pairCount();
765 basesInT+=initialLength1+initialLength2;
766
767 if(r2!=null){
768 if(merge){
769 final int insert=BBMerge.findOverlapStrict(r1, r2, false);
770 if(insert>0){
771 r2.reverseComplement();
772 r1=r1.joinRead(insert);
773 r2=null;
774 }
775 }else if(ecco){
776 BBMerge.findOverlapStrict(r1, r2, true);
777 }
778 }
779
780 {
781 //Reads are processed in this block.
782 {
783 ArrayList<Orf> list=processRead(r1);
784 if(list!=null){orfList.addAll(list);}
785 }
786 if(r2!=null){
787 ArrayList<Orf> list=processRead(r2);
788 if(list!=null){orfList.addAll(list);}
789 }
790 }
791 }
792
793 genesOutT+=orfList.size();
794 ByteBuilder bb=new ByteBuilder();
795
796 if(bsw!=null){
797 if(bsw.ordered){
798 for(Orf orf : orfList){
799 orf.appendGff(bb);
800 bb.nl();
801 }
802 bsw.add(bb, ln.id);
803 bytesOutT+=bb.length();
804 }else{
805 for(Orf orf : orfList){
806 orf.appendGff(bb);
807 bb.nl();
808 // if(bb.length()>=32000){
809 // bytesOutT+=bb.length();
810 // bsw.addJob(bb);
811 // bb=new ByteBuilder();
812 // }
813 }
814 if(bb.length()>0){
815 bsw.addJob(bb);
816 bytesOutT+=bb.length();
817 }
818 }
819 }
820
821 //Notify the input stream that the list was used
822 cris.returnList(ln);
823 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
824 }
825
826 /**
827 * Process a read or a read pair.
828 * @param r1 Read 1
829 * @param r2 Read 2 (may be null)
830 * @return True if the reads should be kept, false if they should be discarded.
831 */
832 ArrayList<Orf> processRead(final Read r){
833 ArrayList<Orf> list=caller.callGenes(r, pgm);
834
835 if(geneHistT!=null && list!=null){
836 for(Orf o : list){
837 int bin=Tools.min(geneHistT.length-1, o.length()/geneHistDiv);
838 geneHistT[bin]++;
839 }
840 }
841
842 if(ros16S!=null){
843 if(list!=null && !list.isEmpty()){
844 // System.err.println("Looking for 16S.");
845 ArrayList<Read> ssu=fetchType(r, list, r16S);
846 if(ssu!=null && !ssu.isEmpty()){
847 // System.err.println("Found "+ssu.size()+" 16S.");
848 ros16S.add(ssu, r.numericID);
849 }
850 }
851 }
852 if(ros18S!=null){
853 if(list!=null && !list.isEmpty()){
854 ArrayList<Read> ssu=fetchType(r, list, r18S);
855 if(ssu!=null && !ssu.isEmpty()){ros18S.add(ssu, r.numericID);}
856 }
857 }
858
859 if(rosAmino!=null){
860 if(mode==TRANSLATE){
861 if(list!=null && !list.isEmpty()){
862 ArrayList<Read> prots=translate(r, list);
863 if(prots!=null){rosAmino.add(prots, r.numericID);}
864 }
865 }else if(mode==RETRANSLATE) {
866 if(list!=null && !list.isEmpty()){
867 ArrayList<Read> prots=translate(r, list);
868 ArrayList<Read> ret=detranslate(prots);
869 if(ret!=null){rosAmino.add(ret, r.numericID);}
870 }
871 }else if(mode==RECODE) {
872 if(list!=null && !list.isEmpty()){
873 Read recoded=recode(r, list);
874 r.mate=null;
875 ArrayList<Read> rec=new ArrayList<Read>(1);
876 rec.add(recoded);
877 if(rec!=null){rosAmino.add(rec, r.numericID);}
878 }
879 }else{
880 assert(false) : mode;
881 }
882 }
883
884 return list;
885 }
886
887 /** Number of reads processed by this thread */
888 protected long readsInT=0;
889 /** Number of bases processed by this thread */
890 protected long basesInT=0;
891
892 /** Number of genes called by this thread */
893 protected long genesOutT=0;
894 /** Number of bytes written by this thread */
895 protected long bytesOutT=0;
896
897 final long[] geneHistT;
898
899 protected ConcurrentReadOutputStream rosAmino;
900 protected ConcurrentReadOutputStream ros16S;
901 protected ConcurrentReadOutputStream ros18S;
902
903 /** True only if this thread has completed successfully */
904 boolean success=false;
905
906 /** Shared input stream */
907 private final ConcurrentReadInputStream cris;
908 /** Shared output stream */
909 private final ByteStreamWriter bsw;
910 /** Gene Model for annotation (not really needed) */
911 private final GeneModel pgm;
912 /** Gene Caller for annotation */
913 final GeneCaller caller;
914 /** Thread ID */
915 final int tid;
916 }
917
918 public static ArrayList<Read> fetchType(final Read r, final ArrayList<Orf> list, int type){
919 if(list==null || list.isEmpty()){return null;}
920 ArrayList<Read> ret=new ArrayList<Read>(list.size());
921 for(int strand=0; strand<2; strand++){
922 for(Orf orf : list){
923 if(orf.strand==strand && orf.type==type){
924 Read sequence=fetch(orf, r.bases, r.id);
925 ret.add(sequence);
926 }
927 }
928 r.reverseComplement();
929 }
930 return (ret.size()>0 ? ret : null);
931 }
932
933 public static ArrayList<Read> translate(final Read r, final ArrayList<Orf> list){
934 if(list==null || list.isEmpty()){return null;}
935 ArrayList<Read> prots=new ArrayList<Read>(list.size());
936 for(int strand=0; strand<2; strand++){
937 for(Orf orf : list){
938 if(orf.strand==strand && orf.type==CDS){
939 Read aa=translate(orf, r.bases, r.id);
940 prots.add(aa);
941 }
942 }
943 r.reverseComplement();
944 }
945 return prots.isEmpty() ? null : prots;
946 }
947
948 public static Read recode(final Read r, final ArrayList<Orf> list){
949 if(list==null || list.isEmpty()){return r;}
950 for(int strand=0; strand<2; strand++){
951 for(Orf orf : list){
952 if(orf.strand==strand && orf.type==CDS){
953 recode(orf, r.bases);
954 }
955 }
956 r.reverseComplement();
957 }
958 return r;
959 }
960
961 public static ArrayList<Read> detranslate(final ArrayList<Read> prots){
962 if(prots==null || prots.isEmpty()){return null;}
963 ArrayList<Read> nucs=new ArrayList<Read>(prots.size());
964 for(int strand=0; strand<2; strand++){
965 for(Read prot : prots){
966 Read nuc=detranslate(prot);
967 nucs.add(nuc);
968 }
969 }
970 return nucs;
971 }
972
973 public static Read translate(Orf orf, byte[] bases, String id){
974 // assert(orf.length()%3==0) : orf.length(); //Happens sometimes on genes that go off the end, perhaps
975 if(orf.strand==1){orf.flip();}
976 byte[] acids=AminoAcid.toAAs(bases, orf.start, orf.stop);
977 if(orf.strand==1){orf.flip();}
978 Read r=new Read(acids, null, id+"\t"+(Shared.strandCodes[orf.strand]+"\t"+orf.start+"-"+orf.stop), 0, Read.AAMASK);
979 // assert((r.length()+1)*3==orf.length());
980 return r;
981 }
982
983 public static Read fetch(Orf orf, Read source){
984 assert(orf.start>=0 && orf.stop<source.length()) : source.length()+"\n"+orf;
985 if(orf.strand==1){source.reverseComplement();}
986 Read r=fetch(orf, source.bases, source.id);
987 if(orf.strand==1){source.reverseComplement();}
988 return r;
989 }
990
991 public static Read fetch(Orf orf, byte[] bases, String id){
992 assert(orf.start>=0 && orf.stop<bases.length) : bases.length+"\n"+orf;
993 if(orf.strand==1){orf.flip();}
994 byte[] sub=Arrays.copyOfRange(bases, orf.start, orf.stop+1);
995 if(orf.strand==1){orf.flip();}
996 Read r=new Read(sub, null, id+"\t"+(Shared.strandCodes[orf.strand]+"\t"+orf.start+"-"+orf.stop), 0, 0);
997 assert(r.length()==orf.length()) : r.length()+", "+orf.length();
998 return r;
999 }
1000
1001 public static void recode(Orf orf, byte[] bases){
1002 if(orf.strand==1){orf.flip();}
1003 byte[] acids=AminoAcid.toAAs(bases, orf.start, orf.stop);
1004 for(int apos=0, bpos=orf.start; apos<acids.length; apos++){
1005 byte aa=acids[apos];
1006 int number=AminoAcid.acidToNumber[aa];
1007 String codon=(number>=0 ? AminoAcid.canonicalCodons[number] : "NNN");
1008 for(int i=0; i<3; i++, bpos++) {
1009 bases[bpos]=(byte)codon.charAt(i);
1010 }
1011 }
1012 if(orf.strand==1){orf.flip();}
1013 }
1014
1015 public static Read detranslate(Read prot){
1016 ByteBuilder bb=new ByteBuilder(prot.length()*3);
1017 for(byte aa : prot.bases){
1018 int number=AminoAcid.acidToNumber[aa];
1019 String codon=(number>=0 ? AminoAcid.canonicalCodons[number] : "NNN");
1020 bb.append(codon);
1021 }
1022 Read r=new Read(bb.array, null, prot.id, prot.numericID, 0);
1023 return r;
1024 }
1025
1026 /*--------------------------------------------------------------*/
1027 /*---------------- Fields ----------------*/
1028 /*--------------------------------------------------------------*/
1029
1030 public static GeneCaller makeGeneCaller(GeneModel pgm){
1031 GeneCaller caller=new GeneCaller(minLen, maxOverlapSameStrand, maxOverlapOppositeStrand,
1032 minStartScore, minStopScore, minKmerScore, minOrfScore, minAvgScore, pgm);
1033 return caller;
1034 }
1035
1036 private long maxReads=-1;
1037 private boolean merge;
1038 private boolean ecco;
1039
1040 private long readsIn=0;
1041 private long basesIn=0;
1042 private long genesOut=0;
1043 private long bytesOut=0;
1044
1045 private static int minLen=80;//Actually a much higher value like 200 seems optimal compared to NCBI
1046 private static int maxOverlapSameStrand=80;
1047 private static int maxOverlapOppositeStrand=110;
1048
1049 /* for kinnercds=6 */
1050 // private static float minStartScore=-0.10f;
1051 // private static float minStopScore=-0.5f;//Not useful; disabled
1052 // private static float minKmerScore=0.04f;//Does not seem useful.
1053 // private static float minOrfScore=40f; //Higher increases SNR dramatically but reduces TP rate
1054 // private static float minAvgScore=0.08f; //Not very effective
1055
1056 /* for kinnercds=7 */
1057 private static float minStartScore=-0.10f;
1058 private static float minStopScore=-0.5f;//Not useful; disabled
1059 private static float minKmerScore=0.02f;//Does not seem useful.
1060 private static float minOrfScore=50f; //Higher increases SNR dramatically but reduces TP rate
1061 private static float minAvgScore=0.08f; //Not very effective
1062
1063 long geneStopsMade=0;
1064 long geneStartsMade=0;
1065 long geneStartsRetained=0;
1066 long geneStopsRetained=0;
1067 long geneStartsOut=0;
1068
1069 long tRNAOut=0;
1070 long r16SOut=0;
1071 long r23SOut=0;
1072 long r5SOut=0;
1073 long r18SOut=0;
1074
1075 ScoreTracker stCds=new ScoreTracker(CDS);
1076 ScoreTracker stCds2=new ScoreTracker(CDS);
1077 ScoreTracker stCdsPass=new ScoreTracker(CDS);
1078 ScoreTracker sttRNA=new ScoreTracker(tRNA);
1079 ScoreTracker st16s=new ScoreTracker(r16S);
1080 ScoreTracker st23s=new ScoreTracker(r23S);
1081 ScoreTracker st5s=new ScoreTracker(r5S);
1082 ScoreTracker st18s=new ScoreTracker(r18S);
1083
1084 ScoreTracker[] trackers=new ScoreTracker[] {stCdsPass, sttRNA, st16s, st23s, st5s, st18s};
1085
1086 private int geneHistBins=2000;
1087 private int geneHistDiv=20;
1088 private final long[] geneHist;
1089 private boolean printZeroCountHist=false;
1090
1091 /*--------------------------------------------------------------*/
1092
1093 private ArrayList<String> fnaList=new ArrayList<String>();
1094 private ArrayList<String> pgmList=new ArrayList<String>();
1095 private String outGff=null;
1096 private String outAmino=null;
1097 private String out16S=null;
1098 private String out18S=null;
1099 private String compareToGff=null;
1100 private String outStats="stderr";
1101 private String geneHistFile=null;
1102 private boolean json_out=false;
1103
1104 /*--------------------------------------------------------------*/
1105 /*---------------- Final Fields ----------------*/
1106 /*--------------------------------------------------------------*/
1107
1108 private final FileFormat ffoutGff;
1109 private final FileFormat ffoutAmino;
1110 private final FileFormat ffout16S;
1111 private final FileFormat ffout18S;
1112
1113 /** Determines how sequence is processed if it will be output */
1114 int mode=TRANSLATE;
1115
1116 /** Translate nucleotides to amino acids */
1117 private static final int TRANSLATE=1;
1118 /** Translate nucleotides to amino acids,
1119 * then translate back to nucleotides */
1120 private static final int RETRANSLATE=2;
1121 /** Re-encode coding regions of nucleotide
1122 * sequences as a canonical codons */
1123 private static final int RECODE=3;
1124
1125 /*--------------------------------------------------------------*/
1126
1127 private PrintStream outstream=System.err;
1128 public boolean verbose=false;
1129 public boolean errorState=false;
1130 private boolean overwrite=false;
1131 private boolean append=false;
1132 private boolean ordered=false; //this is OK sometimes, but sometimes hangs (e.g. on RefSeq mito), possibly if a sequence produces nothing.
1133 //To fix it, just ensure functions like translate always produce an array, even if it is empty.
1134
1135 }