comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/SplitByTaxa.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package tax;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.HashMap;
7
8 import fileIO.ByteFile;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import shared.Parse;
12 import shared.Parser;
13 import shared.PreParser;
14 import shared.ReadStats;
15 import shared.Shared;
16 import shared.Timer;
17 import shared.Tools;
18 import stream.ConcurrentReadInputStream;
19 import stream.ConcurrentReadOutputStream;
20 import stream.FASTQ;
21 import stream.FastaReadInputStream;
22 import stream.Read;
23 import structures.ListNum;
24
25 /**
26 * Splits sequences into separate per-taxon output files according to their taxonomy,
27 * as determined by the sequence name. Sequences should
28 * be labeled with a gi number or NCBI taxID.
29 *
30 * @author Brian Bushnell
31 * @date November 23, 2015
32 *
33 */
34 public class SplitByTaxa {
35
36 /*--------------------------------------------------------------*/
37 /*---------------- Initialization ----------------*/
38 /*--------------------------------------------------------------*/
39
40 /**
41 * Code entrance from the command line.
42 * @param args Command line arguments
43 */
44 public static void main(String[] args){
45 Timer t=new Timer();
46 SplitByTaxa x=new SplitByTaxa(args);
47 x.process(t);
48
49 //Close the print stream if it was redirected
50 Shared.closeStream(x.outstream);
51 }
52
53 /**
54 * Constructor.
55 * @param args Command line arguments
56 */
57 public SplitByTaxa(String[] args){
58
59 {//Preparse block for help, config files, and outstream
60 PreParser pp=new PreParser(args, getClass(), false);
61 args=pp.args;
62 outstream=pp.outstream;
63 }
64
65 //Set some shared static variables
66 Shared.capBuffers(4);
67 ReadWrite.USE_UNPIGZ=true;
68 ReadWrite.USE_PIGZ=false;
69 ReadWrite.USE_GZIP=false;
70 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
71
72 boolean setInterleaved=false; //Whether interleaved was explicitly set.
73 String giTableFile=null;
74 String taxTreeFile=null;
75
76 //Create a parser object
77 Parser parser=new Parser();
78
79 //Parse each argument
80 for(int i=0; i<args.length; i++){
81 String arg=args[i];
82
83 //Break arguments into their constituent parts, in the form of "a=b"
84 String[] split=arg.split("=");
85 String a=split[0].toLowerCase();
86 String b=split.length>1 ? split[1] : null;
87
88 if(parser.parse(arg, a, b)){//Parse standard flags in the parser
89 //do nothing
90 }else if(a.equals("verbose")){
91 verbose=Parse.parseBoolean(b);
92 }else if(a.equals("taxlevel") || a.equals("tl") || a.equals("level") || a.equals("lv")){
93 taxLevelE=TaxTree.parseLevelExtended(b);
94 }else if(a.equals("table") || a.equals("gi") || a.equals("gitable")){
95 giTableFile=b;
96 }else if(a.equals("tree") || a.equals("taxtree")){
97 taxTreeFile=b;
98 }else{
99 outstream.println("Unknown parameter "+args[i]);
100 assert(false) : "Unknown parameter "+args[i];
101 // throw new RuntimeException("Unknown parameter "+args[i]);
102 }
103 }
104
105 if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}
106 if("auto".equalsIgnoreCase(giTableFile)){giTableFile=TaxTree.defaultTableFile();}
107
108 {//Process parser fields
109 Parser.processQuality();
110
111 maxReads=parser.maxReads;
112
113 overwrite=ReadStats.overwrite=parser.overwrite;
114 append=ReadStats.append=parser.append;
115 setInterleaved=parser.setInterleaved;
116
117 in1=parser.in1;
118 in2=parser.in2;
119
120 out1=parser.out1;
121 out2=parser.out2;
122
123 extin=parser.extin;
124 extout=parser.extout;
125 }
126
127 //Do input file # replacement
128 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
129 in2=in1.replace("#", "2");
130 in1=in1.replace("#", "1");
131 }
132
133 //Do output file # replacement
134 if(out1!=null && out2==null && out1.indexOf('#')>-1){
135 out2=out1.replace("#", "2");
136 out1=out1.replace("#", "1");
137 }
138
139 assert(out1==null || out1.contains("%")) : "Output filename must contain % symbol.";
140 assert(out2==null || out2.contains("%")) : "Output filename must contain % symbol.";
141 assert(taxTreeFile!=null) : "This requires a taxtree file. On NERSC systems, set tree=auto.";
142
143 //Adjust interleaved detection based on the number of input files
144 if(in2!=null){
145 if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
146 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
147 }
148
149 assert(FastaReadInputStream.settingsOK());
150
151 //Ensure there is an input file
152 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
153
154 //Adjust the number of threads for input file reading
155 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
156 ByteFile.FORCE_MODE_BF2=true;
157 }
158
159 //Ensure out2 is not set without out1
160 if(out1==null && out2!=null){throw new RuntimeException("Error - cannot define out2 without defining out1.");}
161
162 //Adjust interleaved settings based on number of output files
163 if(!setInterleaved){
164 assert(in1!=null && (out1!=null || out2==null)) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n";
165 if(in2!=null){ //If there are 2 input streams.
166 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
167 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
168 }else{ //There is one input stream.
169 if(out2!=null){
170 FASTQ.FORCE_INTERLEAVED=true;
171 FASTQ.TEST_INTERLEAVED=false;
172 outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED);
173 }
174 }
175 }
176
177 //Ensure output files can be written
178 if(!Tools.testOutputFiles(overwrite, append, false, out1, out2)){
179 outstream.println((out1==null)+", "+(out2==null)+", "+out1+", "+out2);
180 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n");
181 }
182
183 //Ensure input files can be read
184 if(!Tools.testInputFiles(false, true, in1, in2)){
185 throw new RuntimeException("\nCan't read some input files.\n");
186 }
187
188 //Ensure that no file was specified multiple times
189 if(!Tools.testForDuplicateFiles(true, in1, in2, out1, out2)){
190 throw new RuntimeException("\nSome file names were specified multiple times.\n");
191 }
192
193 //Create input FileFormat objects
194 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
195 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
196
197 TaxFilter.loadGiTable(giTableFile);
198 tree=TaxFilter.loadTree(taxTreeFile);
199 }
200
201 /*--------------------------------------------------------------*/
202 /*---------------- Outer Methods ----------------*/
203 /*--------------------------------------------------------------*/
204
205 /** Create read streams and process all data */
206 public void process(Timer t){
207
208 //Create a read input stream
209 final ConcurrentReadInputStream cris;
210 {
211 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
212 cris.start(); //Start the stream
213 if(verbose){outstream.println("Started cris");}
214 }
215 boolean paired=cris.paired();
216 if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
217
218 //Reset counters
219 readsProcessed=0;
220 basesProcessed=0;
221
222 final HashMap<String, ConcurrentReadOutputStream> map=new HashMap<String, ConcurrentReadOutputStream>();
223
224 //Process the read stream
225 processInner(cris, map);
226
227 if(verbose){outstream.println("Finished; closing streams.");}
228
229 //Write anything that was accumulated by ReadStats
230 errorState|=ReadStats.writeAll();
231 //Close the read streams
232 errorState|=ReadWrite.closeStream(cris);
233
234 for(ConcurrentReadOutputStream ros : map.values()){
235 ReadWrite.closeStream(ros);
236 }
237
238 //Report timing and results
239 t.stop();
240 outstream.println("Reads In: \t"+readsProcessed+" reads \t"+basesProcessed+" bases");
241 outstream.println();
242 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
243
244 //Throw an exception of there was an error in a thread
245 if(errorState){
246 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
247 }
248 }
249
250 /** Iterate through the reads */
251 void processInner(final ConcurrentReadInputStream cris, HashMap<String, ConcurrentReadOutputStream> map){
252
253 {
254 //Grab the first ListNum of reads
255 ListNum<Read> ln=cris.nextList();
256 //Grab the actual read list from the ListNum
257 ArrayList<Read> reads=(ln!=null ? ln.list : null);
258
259 //Check to ensure pairing is as expected
260 if(reads!=null && !reads.isEmpty()){
261 Read r=reads.get(0);
262 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
263 }
264
265 //As long as there is a nonempty read list...
266 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
267 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
268
269 //Loop through each read in the list
270 for(int idx=0; idx<reads.size(); idx++){
271 final Read r1=reads.get(idx);
272
273 //Track the initial length for statistics
274 final int initialLength1=r1.length();
275 final int initialLength2=(r1.mateLength());
276
277 //Increment counters
278 readsProcessed+=r1.pairCount();
279 basesProcessed+=initialLength1+initialLength2;
280
281 TaxNode tn=tree.parseNodeFromHeader(r1.id, true);
282 if(tn==null){tn=tree.getNodeByName(r1.id);}
283 if(tn==null){tn=unknown;}
284 while(tn.levelExtended<taxLevelE && tn.id!=tn.pid){tn=tree.getNode(tn.pid);}
285
286 if(out1!=null){
287 ConcurrentReadOutputStream ros=map.get(tn.name);
288 if(ros==null){
289 final int buff=4;
290 FileFormat ffout1=null, ffout2=null;
291 ffout1=FileFormat.testOutput(out1.replaceFirst("%", tn.name.replaceAll("\\s+", "_").replaceAll("[/\\\\]", "")), FileFormat.FASTQ, extout, false, overwrite, append, false);
292 if(out2!=null){ffout2=FileFormat.testOutput(out2.replaceFirst("%", tn.name.replaceAll("\\s+", "_").replaceAll("[/\\\\]", "")), FileFormat.FASTQ, extout, false, overwrite, append, false);}
293 ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, null, null, buff, null, false);
294 ros.start(); //Start the stream
295 map.put(tn.name, ros);
296 }
297 ArrayList<Read> temp=new ArrayList<Read>(1); //Kind of inefficient
298 temp.add(r1);
299 ros.add(temp, 0);
300 }
301 }
302
303 //Notify the input stream that the list was used
304 cris.returnList(ln);
305 if(verbose){outstream.println("Returned a list.");}
306
307 //Fetch a new list
308 ln=cris.nextList();
309 reads=(ln!=null ? ln.list : null);
310 }
311
312 //Notify the input stream that the final list was used
313 if(ln!=null){
314 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
315 }
316 }
317
318 //Do anything necessary after processing
319
320 }
321
322 /*--------------------------------------------------------------*/
323 /*---------------- Inner Methods ----------------*/
324 /*--------------------------------------------------------------*/
325
326 /*--------------------------------------------------------------*/
327 /*---------------- Fields ----------------*/
328 /*--------------------------------------------------------------*/
329
330 /** Primary input file path */
331 private String in1=null;
332 /** Secondary input file path */
333 private String in2=null;
334
335 /** Primary output file path */
336 private String out1=null;
337 /** Secondary output file path */
338 private String out2=null;
339
340 /** Override input file extension */
341 private String extin=null;
342 /** Override output file extension */
343 private String extout=null;
344
345 /** The actual filter */
346 private int taxLevelE=TaxTree.stringToLevelExtended("phylum");
347
348 /*--------------------------------------------------------------*/
349
350 /** Number of reads processed */
351 protected long readsProcessed=0;
352 /** Number of bases processed */
353 protected long basesProcessed=0;
354
355 /** Quit after processing this many input reads; -1 means no limit */
356 private long maxReads=-1;
357
358 /*--------------------------------------------------------------*/
359 /*---------------- Final Fields ----------------*/
360 /*--------------------------------------------------------------*/
361
362 /** Primary input file */
363 private final FileFormat ffin1;
364 /** Secondary input file */
365 private final FileFormat ffin2;
366
367 private final TaxTree tree;
368
369 private final TaxNode unknown=new TaxNode(-99, -99, TaxTree.LIFE, TaxTree.LIFE_E, "UNKNOWN");
370
371 /*--------------------------------------------------------------*/
372 /*---------------- Common Fields ----------------*/
373 /*--------------------------------------------------------------*/
374
375 /** Print status messages to this output stream */
376 private PrintStream outstream=System.err;
377 /** Print verbose messages */
378 public static boolean verbose=false;
379 /** True if an error was encountered */
380 public boolean errorState=false;
381 /** Overwrite existing output files */
382 private boolean overwrite=false;
383 /** Append to existing output files */
384 private boolean append=false;
385 /** This flag has no effect on singlethreaded programs */
386 private final boolean ordered=false;
387
388 }