comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/clump/KmerReduce.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package clump;
2
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Random;
7
8 import assemble.AbstractRemoveThread;
9 import dna.AminoAcid;
10 import fileIO.ByteFile;
11 import fileIO.FileFormat;
12 import fileIO.ReadWrite;
13 import jgi.BBMerge;
14 import kmer.KmerTableSet;
15 import shared.KillSwitch;
16 import shared.Parse;
17 import shared.Parser;
18 import shared.PreParser;
19 import shared.ReadStats;
20 import shared.Shared;
21 import shared.Timer;
22 import shared.Tools;
23 import stream.ConcurrentReadInputStream;
24 import stream.ConcurrentReadOutputStream;
25 import stream.FASTQ;
26 import stream.FastaReadInputStream;
27 import stream.Read;
28 import structures.ListNum;
29
30 /**
31 * Reduces reads to their feature kmer.
32 * @author Brian Bushnell
33 * @date Nov 10, 2015
34 *
35 */
36 public class KmerReduce {
37
38 /*--------------------------------------------------------------*/
39 /*---------------- Static Methods ----------------*/
40 /*--------------------------------------------------------------*/
41
42 /**
43 * Code entrance from the command line.
44 * @param args Command line arguments
45 */
46 public static void main(String[] args){
47 final boolean pigz=ReadWrite.USE_PIGZ, unpigz=ReadWrite.USE_UNPIGZ;
48 Timer t=new Timer();
49 KmerReduce x=new KmerReduce(args);
50 x.process(t);
51 ReadWrite.USE_PIGZ=pigz;
52 ReadWrite.USE_UNPIGZ=unpigz;
53
54 //Close the print stream if it was redirected
55 Shared.closeStream(x.outstream);
56 }
57
58 /**
59 * @param fname0 Input filename of reads
60 * @param k Kmer length
61 * @param cutoff Minimum count to retain
62 * @return Set of pivot kmers
63 */
64 public static KmerTableSet getValidKmersFromReads(final String fname0, int k, int cutoff){
65 final String fname=fname0+"_"+(new Random().nextLong()>>>1)+".fa.gz";
66 assert(!new File(fname).exists());
67
68 ArrayList<String> arglist=new ArrayList<String>();
69 arglist.add("in="+fname0);
70 arglist.add("out="+fname);
71 arglist.add("k="+k);
72 String[] args=arglist.toArray(new String[0]);
73
74 main(args);
75
76 assert(false) : fname+", "+k+", "+cutoff;
77 KmerTableSet set=getValidKmers(fname, k, cutoff);
78 File f=new File(fname);
79 if(f.exists()){f.delete();}
80
81 return set;
82 }
83
84 /**
85 * @param fname Input filename of pivot kmers
86 * @param k Kmer length
87 * @param cutoff Minimum count to retain
88 * @return Set of pivot kmers
89 */
90 public static KmerTableSet getValidKmers(final String fname, int k, int cutoff){
91 ArrayList<String> arglist=new ArrayList<String>();
92 arglist.add("in="+fname);
93 arglist.add("k="+k);
94 if(cutoff>1 && prefilter){
95 arglist.add("prefilter="+(cutoff-1));
96 }
97
98 String[] args=arglist.toArray(new String[0]);
99 KmerTableSet set=new KmerTableSet(args, 12);
100
101 Timer t=new Timer();
102
103 set.process(t);
104 // errorState|=set.errorState;
105 assert(!set.errorState);
106 t.stop();
107
108 set.prefilterArray=null;
109 AbstractRemoveThread.process(Shared.threads(), cutoff, Integer.MAX_VALUE, set, true);
110
111 return set;
112 }
113
114 /*--------------------------------------------------------------*/
115 /*---------------- Initialization ----------------*/
116 /*--------------------------------------------------------------*/
117
118 /**
119 * Constructor.
120 * @param args Command line arguments
121 */
122 public KmerReduce(String[] args){
123
124 {//Preparse block for help, config files, and outstream
125 PreParser pp=new PreParser(args, getClass(), false);
126 args=pp.args;
127 outstream=pp.outstream;
128 }
129
130 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
131 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
132
133 Parser parser=new Parser();
134 for(int i=0; i<args.length; i++){
135 String arg=args[i];
136 String[] split=arg.split("=");
137 String a=split[0].toLowerCase();
138 String b=split.length>1 ? split[1] : null;
139
140 if(parser.parse(arg, a, b)){
141 //do nothing
142 }else if(a.equals("verbose")){
143 verbose=Parse.parseBoolean(b);
144 }else if(a.equals("parse_flag_goes_here")){
145 //Set a variable here
146 }else if(a.equals("k")){
147 k=Integer.parseInt(b);
148 assert(k>0 && k<32);
149 }else if(a.equals("comparisons") || a.equals("c")){
150 //do nothing
151 }else if(a.equals("ecco")){
152 ecco=Parse.parseBoolean(b);
153 }else if(a.equals("rename") || a.equals("addname")){
154 //do nothing
155 }else if(a.equals("rcomp") || a.equals("reversecomplement")){
156 //do nothing
157 }else if(a.equals("condense") || a.equals("consensus")){
158 //do nothing
159 }else if(a.equals("correct") || a.equals("ecc")){
160 //do nothing
161 }else if(a.equals("groups") || a.equals("g") || a.equals("sets") || a.equals("ways")){
162 //do nothing
163 }else if(a.equals("seed")){
164 KmerComparator.defaultSeed=Long.parseLong(b);
165 }else if(a.equals("hashes")){
166 KmerComparator.setHashes(Integer.parseInt(b));
167 }else{
168 outstream.println("Unknown parameter "+args[i]);
169 assert(false) : "Unknown parameter "+args[i];
170 // throw new RuntimeException("Unknown parameter "+args[i]);
171 }
172 }
173
174 {//Process parser fields
175 Parser.processQuality();
176
177 maxReads=parser.maxReads;
178
179 overwrite=ReadStats.overwrite=parser.overwrite;
180 append=ReadStats.append=parser.append;
181
182 in1=parser.in1;
183 in2=parser.in2;
184
185 out1=parser.out1;
186
187 extin=parser.extin;
188 extout=parser.extout;
189 }
190
191 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
192 in2=in1.replace("#", "2");
193 in1=in1.replace("#", "1");
194 }
195 if(in2!=null){
196 if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
197 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
198 }
199
200 assert(FastaReadInputStream.settingsOK());
201
202 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
203
204 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
205 ByteFile.FORCE_MODE_BF2=true;
206 }
207
208 if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;}
209
210 if(!Tools.testOutputFiles(overwrite, append, false, out1)){
211 outstream.println((out1==null)+", "+out1);
212 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out1+"\n");
213 }
214
215 ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, false);
216
217 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
218 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
219 }
220
221
222 /*--------------------------------------------------------------*/
223 /*---------------- Outer Methods ----------------*/
224 /*--------------------------------------------------------------*/
225
226 /** Create read streams and process all data */
227 void process(Timer t){
228
229 final ConcurrentReadInputStream cris;
230 {
231 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, null, null);
232 cris.start();
233 if(verbose){outstream.println("Started cris");}
234 }
235 boolean paired=cris.paired();
236
237 final ConcurrentReadOutputStream ros;
238 if(out1!=null){
239 final int buff=Tools.max(4, Shared.threads());
240
241 assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name.";
242
243 ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
244 ros.start();
245 }else{ros=null;}
246
247 readsProcessed=0;
248 basesProcessed=0;
249
250 //Process the read stream
251 processInner(cris, ros);
252
253 if(verbose){outstream.println("Finished; closing streams.");}
254
255 errorState|=ReadStats.writeAll();
256 errorState|=ReadWrite.closeStreams(cris, ros);
257
258 t.stop();
259
260 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
261
262 if(errorState){
263 Clumpify.sharedErrorState=true;
264 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
265 }
266 }
267
268 /** Manage threads */
269 public void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
270 if(verbose){outstream.println("Making comparator.");}
271 KmerComparator kc=new KmerComparator(k, false, false);
272
273 if(verbose){outstream.println("Making hash threads.");}
274 final int threads=Shared.threads();
275 ArrayList<HashThread> alht=new ArrayList<HashThread>(threads);
276 for(int i=0; i<threads; i++){alht.add(new HashThread(cris, ros, kc));}
277
278 if(verbose){outstream.println("Starting threads.");}
279 for(HashThread ht : alht){ht.start();}
280
281 if(verbose){outstream.println("Waiting for threads.");}
282 /* Wait for threads to die */
283 for(HashThread ht : alht){
284
285 /* Wait for a thread to die */
286 while(ht.getState()!=Thread.State.TERMINATED){
287 try {
288 ht.join();
289 } catch (InterruptedException e) {
290 e.printStackTrace();
291 }
292 }
293 readsProcessed+=ht.readsProcessedT;
294 basesProcessed+=ht.basesProcessedT;
295 }
296 }
297
298 /*--------------------------------------------------------------*/
299 /*---------------- Inner Classes ----------------*/
300 /*--------------------------------------------------------------*/
301
302 private class HashThread extends Thread{
303
304 HashThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, KmerComparator kc_){
305 cris=cris_;
306 ros=ros_;
307 kc=kc_;
308 }
309
310 @Override
311 public void run(){
312
313 ListNum<Read> ln=cris.nextList();
314 ArrayList<Read> reads=(ln!=null ? ln.list : null);
315
316 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
317 ArrayList<Read> out=new ArrayList<Read>(reads.size());
318 for(Read r : reads){
319 if(ecco && r.mate!=null){
320 if(r.mate!=null){BBMerge.findOverlapStrict(r, r.mate, true);}
321 }
322 final long kmer=kc.hash(r, null, 0, false);
323 readsProcessedT++;
324 basesProcessedT+=r.length();
325 if(kmer>=0){
326 Read temp=new Read(toBytes(kmer), null, header, r.numericID);
327 out.add(temp);
328 }
329 }
330 if(ros!=null){ros.add(out, ln.id);}
331 cris.returnList(ln);
332 ln=cris.nextList();
333 reads=(ln!=null ? ln.list : null);
334 }
335 if(ln!=null){
336 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
337 }
338 }
339
340 final ConcurrentReadInputStream cris;
341 final ConcurrentReadOutputStream ros;
342 final KmerComparator kc;
343
344 protected long readsProcessedT=0;
345 protected long basesProcessedT=0;
346
347 private static final String header="1";
348 }
349
350 /*--------------------------------------------------------------*/
351 /*---------------- Inner Methods ----------------*/
352 /*--------------------------------------------------------------*/
353
354 public byte[] toBytes(final long kmer){
355 byte[] dest=KillSwitch.allocByte1D(k);
356 fill(kmer, dest, 0);
357 return dest;
358 }
359
360 public void fill(final long kmer, final byte[] dest, int pos){
361 for(int i=k-1; i>=0; i--, pos++){
362 int x=(int)((kmer>>(2*i))&3);
363 dest[pos]=AminoAcid.numberToBase[x];
364 }
365 }
366
367 /*--------------------------------------------------------------*/
368 /*---------------- Fields ----------------*/
369 /*--------------------------------------------------------------*/
370
371 private int k=31;
372 static boolean prefilter=true;
373
374 /*--------------------------------------------------------------*/
375 /*---------------- I/O Fields ----------------*/
376 /*--------------------------------------------------------------*/
377
378 private String in1=null;
379 private String in2=null;
380
381 private String out1=null;
382
383 private String extin=null;
384 private String extout=null;
385
386 /*--------------------------------------------------------------*/
387
388 protected long readsProcessed=0;
389 protected long basesProcessed=0;
390
391 private long maxReads=-1;
392 protected boolean ecco=false;
393
394 /*--------------------------------------------------------------*/
395 /*---------------- Final Fields ----------------*/
396 /*--------------------------------------------------------------*/
397
398 private final FileFormat ffin1;
399 private final FileFormat ffin2;
400
401 private final FileFormat ffout1;
402
403 /*--------------------------------------------------------------*/
404 /*---------------- Common Fields ----------------*/
405 /*--------------------------------------------------------------*/
406
407 private PrintStream outstream=System.err;
408 public static boolean verbose=false;
409 public boolean errorState=false;
410 private boolean overwrite=false;
411 private boolean append=false;
412
413 }