comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/AddSSU.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package sketch;
2
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6
7 import fileIO.ByteFile;
8 import fileIO.ByteStreamWriter;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import shared.Parse;
12 import shared.Parser;
13 import shared.PreParser;
14 import shared.Shared;
15 import shared.Timer;
16 import shared.Tools;
17 import structures.ByteBuilder;
18 import tax.TaxTree;
19
20 /**
21 * @author Brian Bushnell
22 * @date May 9, 2016
23 *
24 */
25 public class AddSSU {
26
27 /*--------------------------------------------------------------*/
28 /*---------------- Initialization ----------------*/
29 /*--------------------------------------------------------------*/
30
31 /**
32 * Code entrance from the command line.
33 * @param args Command line arguments
34 */
35 public static void main(String[] args){
36 //Start a timer immediately upon code entrance.
37 Timer t=new Timer();
38
39 //Create an instance of this class
40 AddSSU x=new AddSSU(args);
41
42 //Run the object
43 x.process(t);
44
45 //Close the print stream if it was redirected
46 Shared.closeStream(x.outstream);
47 }
48
49 /**
50 * Constructor.
51 * @param args Command line arguments
52 */
53 public AddSSU(String[] args){
54
55 {//Preparse block for help, config files, and outstream
56 PreParser pp=new PreParser(args, /*getClass()*/null, false);
57 args=pp.args;
58 outstream=pp.outstream;
59 }
60
61 //Set shared static variables prior to parsing
62 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
63 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
64
65 {//Parse the arguments
66 final Parser parser=parse(args);
67 overwrite=parser.overwrite;
68 append=parser.append;
69
70 in1=parser.in1;
71
72 out1=parser.out1;
73 }
74
75 fixExtensions(); //Add or remove .gz or .bz2 as needed
76 checkFileExistence(); //Ensure files can be read and written
77 checkStatics(); //Adjust file-related static fields as needed for this program
78
79 ffout1=FileFormat.testOutput(out1, FileFormat.SKETCH, null, true, overwrite, append, false);
80 ffin1=FileFormat.testInput(in1, FileFormat.SKETCH, null, true, false);
81
82 if(verbose){
83 System.err.println("Set r16SFile="+r16SFile);
84 System.err.println("Set r18SFile="+r18SFile);
85 }
86
87 tree=(treeFile!=null && (preferSSUMapEuks || preferSSUMapProks || clear16SEuks || clear18SEuks ||
88 clear16SProks || clear18SProks || useSSUMapOnlyEuks || useSSUMapOnlyProks) ? TaxTree.loadTaxTree(treeFile, outstream, false, false) : null);
89
90 if(preferSSUMapEuks || preferSSUMapProks || clear16SEuks || clear18SEuks || clear16SProks || clear18SProks || useSSUMapOnlyEuks || useSSUMapOnlyProks){
91 assert(tree!=null) : "preferSSUMapForEuks, clear16SEuks, and clear18SEuks require a TaxTree.";
92 }
93 }
94
95 /*--------------------------------------------------------------*/
96 /*---------------- Initialization Helpers ----------------*/
97 /*--------------------------------------------------------------*/
98
99 /** Parse arguments from the command line */
100 private Parser parse(String[] args){
101
102 Parser parser=new Parser();
103 for(int i=0; i<args.length; i++){
104 String arg=args[i];
105 String[] split=arg.split("=");
106 String a=split[0].toLowerCase();
107 String b=split.length>1 ? split[1] : null;
108 if(b!=null && b.equalsIgnoreCase("null")){b=null;}
109
110 if(a.equalsIgnoreCase("16S") || a.equalsIgnoreCase("16Sfile")){
111 r16SFile=b;
112 }else if(a.equalsIgnoreCase("18S") || a.equalsIgnoreCase("18Sfile")){
113 r18SFile=b;
114 }else if(a.equalsIgnoreCase("tree") || a.equalsIgnoreCase("treefile")){
115 treeFile=b;
116 }else if(a.equals("lines")){
117 maxLines=Long.parseLong(b);
118 if(maxLines<0){maxLines=Long.MAX_VALUE;}
119 }else if(a.equals("verbose")){
120 verbose=Parse.parseBoolean(b);
121 // ByteFile1.verbose=verbose;
122 // ByteFile2.verbose=verbose;
123 // ReadWrite.verbose=verbose;
124 }
125
126 else if(a.equalsIgnoreCase("preferSSUMap")){
127 preferSSUMap=Parse.parseBoolean(b);
128 }else if(a.equalsIgnoreCase("preferSSUMapForEuks") || a.equalsIgnoreCase("preferSSUMapEuks")){
129 preferSSUMapEuks=Parse.parseBoolean(b);
130 }else if(a.equalsIgnoreCase("useSSUMapOnly")){
131 useSSUMapOnly=Parse.parseBoolean(b);
132 }else if(a.equalsIgnoreCase("useSSUMapOnlyEuks") || a.equalsIgnoreCase("SSUMapOnlyEuks")){
133 useSSUMapOnlyEuks=Parse.parseBoolean(b);
134 }else if(a.equalsIgnoreCase("useSSUMapOnlyProks") || a.equalsIgnoreCase("SSUMapOnlyProks")){
135 useSSUMapOnlyProks=Parse.parseBoolean(b);
136 }else if(a.equalsIgnoreCase("preferSSUMapForProks") || a.equalsIgnoreCase("preferSSUMapProks")){
137 preferSSUMapProks=Parse.parseBoolean(b);
138 }
139
140 else if(a.equalsIgnoreCase("clearAll")){
141 clear16S=clear18S=Parse.parseBoolean(b);
142 }else if(a.equalsIgnoreCase("clear16S")){
143 clear16S=Parse.parseBoolean(b);
144 }else if(a.equalsIgnoreCase("clear18S")){
145 clear18S=Parse.parseBoolean(b);
146 }else if(a.equalsIgnoreCase("clear16SEuks")){
147 clear16SEuks=Parse.parseBoolean(b);
148 }else if(a.equalsIgnoreCase("clear18SEuks")){
149 clear18SEuks=Parse.parseBoolean(b);
150 }else if(a.equalsIgnoreCase("clear16SProks")){
151 clear16SProks=Parse.parseBoolean(b);
152 }else if(a.equalsIgnoreCase("clear18SProks")){
153 clear18SProks=Parse.parseBoolean(b);
154 }
155
156 else if(parser.parse(arg, a, b)){
157 //do nothing
158 }else{
159 outstream.println("Unknown parameter "+args[i]);
160 assert(false) : "Unknown parameter "+args[i];
161 // throw new RuntimeException("Unknown parameter "+args[i]);
162 }
163 }
164 if("auto".equalsIgnoreCase(r16SFile)){r16SFile=TaxTree.default16SFile();}
165 if("auto".equalsIgnoreCase(r18SFile)){r18SFile=TaxTree.default18SFile();}
166 SSUMap.r16SFile=r16SFile;
167 SSUMap.r18SFile=r18SFile;
168
169 return parser;
170 }
171
172 /** Add or remove .gz or .bz2 as needed */
173 private void fixExtensions(){
174 in1=Tools.fixExtension(in1);
175 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
176 }
177
178 /** Ensure files can be read and written */
179 private void checkFileExistence(){
180 //Ensure output files can be written
181 if(!Tools.testOutputFiles(overwrite, append, false, out1)){
182 outstream.println((out1==null)+", "+out1);
183 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
184 }
185
186 //Ensure input files can be read
187 if(!Tools.testInputFiles(false, true, in1, r16SFile, r18SFile)){
188 throw new RuntimeException("\nCan't read some input files.\n");
189 }
190 assert(in1!=null) : "Input sketch file is required";
191 assert(r16SFile!=null || r18SFile!=null) : "Input SSU file is required";
192
193 //Ensure that no file was specified multiple times
194 if(!Tools.testForDuplicateFiles(true, in1, out1, r16SFile, r18SFile)){
195 throw new RuntimeException("\nSome file names were specified multiple times.\n");
196 }
197 }
198
199 /** Adjust file-related static fields as needed for this program */
200 private static void checkStatics(){
201 //Adjust the number of threads for input file reading
202 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
203 ByteFile.FORCE_MODE_BF2=true;
204 }
205
206 // if(!ByteFile.FORCE_MODE_BF2){
207 // ByteFile.FORCE_MODE_BF2=false;
208 // ByteFile.FORCE_MODE_BF1=true;
209 // }
210 }
211
212 /*--------------------------------------------------------------*/
213 /*---------------- Outer Methods ----------------*/
214 /*--------------------------------------------------------------*/
215
216 void process(Timer t){
217
218 ByteFile bf=ByteFile.makeByteFile(ffin1);
219 ByteStreamWriter bsw=makeBSW(ffout1);
220
221 processInner(bf, bsw);
222
223 errorState|=bf.close();
224 if(bsw!=null){errorState|=bsw.poisonAndWait();}
225
226 t.stop();
227
228 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
229 outstream.println(Tools.linesBytesOut(linesProcessed, bytesProcessed, linesOut, bytesOut, 8, true));
230
231 outstream.println();
232 outstream.println(Tools.number("Sketches:", sketchCount, 8));
233 outstream.println(Tools.number("16S In:", r16Sin, 8));
234 outstream.println(Tools.number("18S In:", r18Sin, 8));
235 outstream.println(Tools.number("16S Added:", r16SfromMap, 8));
236 outstream.println(Tools.number("18S Added:", r18SfromMap, 8));
237 outstream.println(Tools.numberPercent("16S Out:", r16Sout, r16Sout*100.0/sketchCount, 2, 8));
238 outstream.println(Tools.numberPercent("18S Out:", r18Sout, r18Sout*100.0/sketchCount, 2, 8));
239
240 if(errorState){
241 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
242 }
243 }
244
245 /*--------------------------------------------------------------*/
246 /*---------------- Inner Methods ----------------*/
247 /*--------------------------------------------------------------*/
248
249 private static ByteStreamWriter makeBSW(FileFormat ff){
250 if(ff==null){return null;}
251 ByteStreamWriter bsw=new ByteStreamWriter(ff);
252 bsw.start();
253 return bsw;
254 }
255
256 // private void processInner_old(ByteFile bf, ByteStreamWriter bsw){
257 // SSUMap.load(outstream);
258 //
259 // if(verbose){
260 // System.err.println("Loaded SSUMap; |16S|="+SSUMap.r16SCount()+", |18S|="+SSUMap.r18SCount());
261 // }
262 //
263 // byte[] line=bf.nextLine();
264 //// ByteBuilder bb=new ByteBuilder();
265 //
266 // final byte[] ssuBytes="SSU:".getBytes();
267 // final byte[] r16SBytes="16S:".getBytes();
268 // final byte[] r18SBytes="18S:".getBytes();
269 //
270 // while(line!=null){
271 // if(line.length>0){
272 // if(maxLines>0 && linesProcessed>=maxLines){break;}
273 // linesProcessed++;
274 // bytesProcessed+=(line.length+1);
275 //
276 // final boolean header=(line[0]=='#');
277 //
278 // linesOut++;
279 // bytesOut+=(line.length+1);
280 //
281 // if(header){
282 // if(Tools.startsWith(line, "#SZ:")){
283 // sketchCount++;
284 //
285 // bsw.print(line);
286 //
287 // final int tid=parseTaxID(line);
288 // final boolean has16S=Tools.contains(line, ssuBytes, 0) || Tools.contains(line, r16SBytes, 0);
289 // final boolean has18S=Tools.contains(line, r18SBytes, 0);
290 //
291 // if(verbose){
292 // System.err.println("For line "+new String(line)+":");
293 // System.err.println("tid="+tid+", has16S="+has16S+", has18S="+has18S);
294 // }
295 //
296 // if(tid>0){
297 // final byte[] r16S=has16S ? null : SSUMap.r16SMap.get(tid);
298 // final byte[] r18S=has18S ? null : SSUMap.r18SMap.get(tid);
299 // if(r16S!=null){bsw.print("\t16S:").print(r16S.length); ssuOut++;}
300 // if(r18S!=null){bsw.print("\t18S:").print(r18S.length); ssuOut++;}
301 // if(r16S!=null){bsw.print("\n#16S:").print(r16S);}
302 // if(r18S!=null){bsw.print("\n#18S:").print(r18S);}
303 //
304 // if(verbose){System.err.println("Found 16S: "+(r16S!=null)+"; found 18S: "+(r18S!=null));}
305 // }
306 // bsw.println();
307 // }else if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#18S:") || Tools.startsWith(line, "#SSU:")){
308 // bsw.println(line);
309 // ssuIn++;
310 // ssuOut++;
311 // }else{
312 // assert(Tools.startsWith(line, "##")) : new String(line);
313 // bsw.println(line);
314 // }
315 // }else{
316 // bsw.println(line);
317 // }
318 // }
319 // line=bf.nextLine();
320 // }
321 // }
322
323 private void processInner(ByteFile bf, ByteStreamWriter bsw){
324 SSUMap.load(outstream);
325
326 if(verbose){
327 System.err.println("Loaded SSUMap; |16S|="+SSUMap.r16SCount()+", |18S|="+SSUMap.r18SCount());
328 }
329
330 byte[] line=bf.nextLine();
331 // ByteBuilder bb=new ByteBuilder();
332
333 // final byte[] ssuBytes="SSU:".getBytes();
334 // final byte[] r16SBytes="16S:".getBytes();
335 // final byte[] r18SBytes="18S:".getBytes();
336
337 SketchHeader header=null;
338 while(line!=null){
339 if(line.length>0){
340 if(maxLines>0 && linesProcessed>=maxLines){break;}
341 linesProcessed++;
342 bytesProcessed+=(line.length+1);
343
344 final boolean isHeader=(line[0]=='#');
345
346 if(isHeader){
347 if(Tools.startsWith(line, "#SZ:")){
348 assert(header==null) : "\nReplacing this:\n"+header.toBytes()+"\nWith this:\n"+new String(line)+"\n";
349 header=new SketchHeader(line);
350 sketchCount++;
351 }else if(Tools.startsWith(line, "##")){
352 bsw.println(line);
353
354 linesOut++;
355 bytesOut+=(line.length+1);
356 }else{
357 header.addLine(line);
358 }
359 }else{
360 if(header!=null){
361 try {
362 processHeader(header);
363 } catch (Throwable e) {
364 e.printStackTrace();
365 assert(false) : header.toBytes();
366 }
367 r16Sout+=(header.r16S==null ? 0 : 1);
368 r18Sout+=(header.r18S==null ? 0 : 1);
369 linesOut+=1+(header.r16S==null ? 0 : 1)+(header.r18S==null ? 0 : 1);
370 ByteBuilder bb=header.toBytes();
371 bytesOut+=(bb.length+1);
372 bsw.println(bb);
373 header=null;
374 }
375 bsw.println(line);
376
377 linesOut++;
378 bytesOut+=(line.length+1);
379 }
380 }
381 line=bf.nextLine();
382 }
383 }
384
385 void processHeader(SketchHeader header){
386
387 if(verbose){System.err.println("Processing tid "+header.tid+":\n"+header.toBytes()+"\n");}
388
389 final boolean euk=(tree!=null && header.tid>0 && header.tid<SketchObject.minFakeID) ? tree.isEukaryote(header.tid) : false;
390 final boolean prok=(tree!=null && header.tid>0 && header.tid<SketchObject.minFakeID) ? tree.isProkaryote(header.tid) : false;
391 if(useSSUMapOnly || (useSSUMapOnlyEuks && euk) || (useSSUMapOnlyProks && prok)){header.r16S=header.r18S=null;}
392 if(header.tid>0){
393 final boolean preferMap=(preferSSUMap || (preferSSUMapEuks && euk) || (preferSSUMapProks && prok));
394 byte[] r16S=(SSUMap.r16SMap==null ? null : SSUMap.r16SMap.get(header.tid));
395 byte[] r18S=(SSUMap.r18SMap==null ? null : SSUMap.r18SMap.get(header.tid));
396 if(r16S!=null && (preferMap || header.r16S==null)){
397 header.r16S=r16S;
398 r16SfromMap++;
399 }
400 if(r18S!=null && (preferMap || header.r18S==null)){
401 header.r18S=r18S;
402 r18SfromMap++;
403 }
404 }
405 if(clear16S || (clear16SEuks && euk) || (clear16SProks && prok)){header.r16S=null;}
406 if(clear18S || (clear18SEuks && euk) || (clear18SProks && prok)){header.r18S=null;}
407 }
408
409 int parseTaxID(byte[] line){
410 String[] split=Tools.tabPattern.split(new String(line));
411 for(String s : split){
412 if(s.startsWith("ID:") || s.startsWith("TAXID:")){
413 final int colon=s.indexOf(':');
414 final String sub=s.substring(colon+1);
415 return Integer.parseInt(sub);
416 }
417 }
418 return -1;
419 }
420
421 /*--------------------------------------------------------------*/
422
423 //A very limited parser
424 private class SketchHeader {
425
426 SketchHeader(byte[] line){
427 this(new String(line, 1, line.length-1));
428 }
429
430 SketchHeader(String line){
431 if(line.charAt(0)=='#'){line=line.substring(1);}
432 assert(line.startsWith("SZ:"));
433 String[] split=Tools.tabPattern.split(line);
434 fields=new ArrayList<String>(line.length()+2);
435 int tid_=-1;
436 for(String s : split){
437 if(s.startsWith("16S:") || s.startsWith("18S:") || s.startsWith("SSU:")){
438 //do nothing
439 }else{
440 if(s.startsWith("ID:") || s.startsWith("TAXID:")){
441 final int colon=s.indexOf(':');
442 final String sub=s.substring(colon+1);
443 tid_=Integer.parseInt(sub);
444 }
445 fields.add(s);
446 }
447 }
448 tid=tid_;
449 }
450
451 void addLine(byte[] line){
452 assert(line[0]=='#');
453 assert(line[1]=='1' || line[1]=='S') : new String(line);
454 if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){
455 assert(r16S==null);
456 r16S=Arrays.copyOfRange(line, 5, line.length);
457 r16Sin++;
458 }else if(Tools.startsWith(line, "#18S:")){
459 assert(r18S==null);
460 r18S=Arrays.copyOfRange(line, 5, line.length);
461 r18Sin++;
462 }else{
463 assert(false) : new String(line);
464 }
465 }
466
467 ByteBuilder toBytes(){
468 ByteBuilder bb=new ByteBuilder(1000);
469 bb.append('#');
470 for(int i=0; i<fields.size(); i++){
471 if(i>0){bb.tab();}
472 bb.append(fields.get(i));
473 }
474 if(r16S!=null){bb.tab().append("16S:").append(r16S.length);}
475 if(r18S!=null){bb.tab().append("18S:").append(r18S.length);}
476
477 if(r16S!=null){bb.nl().append("#16S:").append(r16S);}
478 if(r18S!=null){bb.nl().append("#18S:").append(r18S);}
479 return bb;
480 }
481
482 final int tid;
483 ArrayList<String> fields;
484 byte[] r16S;
485 byte[] r18S;
486 }
487
488 /*--------------------------------------------------------------*/
489 /*---------------- Fields ----------------*/
490 /*--------------------------------------------------------------*/
491
492 private String in1=null; //Input sketch file path; fixExtensions() throws if it is null
493 private String out1=null; //Output sketch file path
494 private String r16SFile="auto"; //16S file path; "auto" is replaced by TaxTree.default16SFile() in parse()
495 private String r18SFile="auto"; //18S file path; "auto" is replaced by TaxTree.default18SFile() in parse()
496 private String treeFile="auto"; //Passed to TaxTree.loadTaxTree, but only when a Euk- or Prok-specific option is set
497
498 boolean preferSSUMap=false; //Replace a sequence already in the header with the one from SSUMap, when SSUMap has one
499 boolean preferSSUMapEuks=false; //As preferSSUMap, but only when the tree reports the tid as a eukaryote
500 boolean preferSSUMapProks=false; //As preferSSUMap, but only when the tree reports the tid as a prokaryote
501 boolean useSSUMapOnly=false; //Discard the sequences from the input before consulting SSUMap
502 boolean useSSUMapOnlyEuks=false; //As useSSUMapOnly, for eukaryotes only
503 boolean useSSUMapOnlyProks=false; //As useSSUMapOnly, for prokaryotes only
504 boolean clear16S=false; //Remove the 16S from every header; applied after the SSUMap lookup
505 boolean clear18S=false; //Remove the 18S from every header; applied after the SSUMap lookup
506 boolean clear16SEuks=false; //Remove the 16S from eukaryote headers
507 boolean clear18SEuks=false; //Remove the 18S from eukaryote headers
508 boolean clear16SProks=false; //Remove the 16S from prokaryote headers
509 boolean clear18SProks=false; //Remove the 18S from prokaryote headers
510
511 /*--------------------------------------------------------------*/
512
513 private long linesProcessed=0; //Non-empty input lines read
514 private long linesOut=0; //Lines written, including regenerated header and SSU lines
515 private long bytesProcessed=0; //Sum of (length+1) over the input lines counted above
516 private long bytesOut=0; //Sum of (length+1) over the lines and header blocks written
517
518 private long sketchCount=0; //Number of #SZ: header lines seen
519
520 private long r16Sin=0; //#16S: and #SSU: lines seen in the input
521 private long r16Sout=0; //Headers written with a 16S attached
522 private long r16SfromMap=0; //Times a 16S was taken from SSUMap (counted before any clear option is applied)
523 private long r18Sin=0; //#18S: lines seen in the input
524 private long r18Sout=0; //Headers written with an 18S attached
525 private long r18SfromMap=0; //Times an 18S was taken from SSUMap (counted before any clear option is applied)
526
527 private long maxLines=Long.MAX_VALUE; //Reading stops once this many non-empty lines are processed; 0 disables the limit, and parse() maps negative values to Long.MAX_VALUE
528
529 /*--------------------------------------------------------------*/
530 /*---------------- Final Fields ----------------*/
531 /*--------------------------------------------------------------*/
532
533 private final FileFormat ffin1; //Input format, created from in1 by FileFormat.testInput in the constructor
534 private final FileFormat ffout1; //Output format, created from out1 by FileFormat.testOutput in the constructor
535
536 private final TaxTree tree; //Null unless a Euk- or Prok-specific option is set and treeFile is non-null
537
538 /*--------------------------------------------------------------*/
539 /*---------------- Common Fields ----------------*/
540 /*--------------------------------------------------------------*/
541
542 private PrintStream outstream=System.err; //Destination for status output; replaced by the PreParser's stream in the constructor
543 public static boolean verbose=false; //When true, extra diagnostics are printed to System.err
544 public boolean errorState=false; //True once an error has been recorded; process() throws at the end when set
545 private boolean overwrite=false; //Copied from parser.overwrite; passed to the output file checks
546 private boolean append=false; //Copied from parser.append; passed to the output file checks
547
548 }