Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/AddSSU.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/sketch/AddSSU.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,548 @@ +package sketch; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import fileIO.ByteFile; +import fileIO.ByteStreamWriter; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import structures.ByteBuilder; +import tax.TaxTree; + +/** + * @author Brian Bushnell + * @date May 9, 2016 + * + */ +public class AddSSU { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Code entrance from the command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + //Start a timer immediately upon code entrance. + Timer t=new Timer(); + + //Create an instance of this class + AddSSU x=new AddSSU(args); + + //Run the object + x.process(t); + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + /** + * Constructor. + * @param args Command line arguments + */ + public AddSSU(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, /*getClass()*/null, false); + args=pp.args; + outstream=pp.outstream; + } + + //Set shared static variables prior to parsing + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); + + {//Parse the arguments + final Parser parser=parse(args); + overwrite=parser.overwrite; + append=parser.append; + + in1=parser.in1; + + out1=parser.out1; + } + + fixExtensions(); //Add or remove .gz or .bz2 as needed + checkFileExistence(); //Ensure files can be read and written + checkStatics(); //Adjust file-related static fields as needed for this program + + ffout1=FileFormat.testOutput(out1, FileFormat.SKETCH, null, true, overwrite, append, false); + ffin1=FileFormat.testInput(in1, FileFormat.SKETCH, null, true, false); + + if(verbose){ + System.err.println("Set r16SFile="+r16SFile); + System.err.println("Set r18SFile="+r18SFile); + } + + tree=(treeFile!=null && (preferSSUMapEuks || preferSSUMapProks || clear16SEuks || clear18SEuks || + clear16SProks || clear18SProks || useSSUMapOnlyEuks || useSSUMapOnlyProks) ? TaxTree.loadTaxTree(treeFile, outstream, false, false) : null); + + if(preferSSUMapEuks || preferSSUMapProks || clear16SEuks || clear18SEuks || clear16SProks || clear18SProks || useSSUMapOnlyEuks || useSSUMapOnlyProks){ + assert(tree!=null) : "preferSSUMapForEuks, clear16SEuks, and clear18SEuks require a TaxTree."; + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Initialization Helpers ----------------*/ + /*--------------------------------------------------------------*/ + + /** Parse arguments from the command line */ + private Parser parse(String[] args){ + + Parser parser=new Parser(); + for(int i=0; i<args.length; i++){ + String arg=args[i]; + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(a.equalsIgnoreCase("16S") || a.equalsIgnoreCase("16Sfile")){ + r16SFile=b; + }else if(a.equalsIgnoreCase("18S") || a.equalsIgnoreCase("18Sfile")){ + r18SFile=b; + }else if(a.equalsIgnoreCase("tree") || a.equalsIgnoreCase("treefile")){ + treeFile=b; + }else if(a.equals("lines")){ + maxLines=Long.parseLong(b); + if(maxLines<0){maxLines=Long.MAX_VALUE;} + }else if(a.equals("verbose")){ + verbose=Parse.parseBoolean(b); +// ByteFile1.verbose=verbose; +// ByteFile2.verbose=verbose; +// ReadWrite.verbose=verbose; + } + + else if(a.equalsIgnoreCase("preferSSUMap")){ + preferSSUMap=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("preferSSUMapForEuks") || a.equalsIgnoreCase("preferSSUMapEuks")){ + preferSSUMapEuks=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("useSSUMapOnly")){ + useSSUMapOnly=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("useSSUMapOnlyEuks") || a.equalsIgnoreCase("SSUMapOnlyEuks")){ + useSSUMapOnlyEuks=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("useSSUMapOnlyProks") || a.equalsIgnoreCase("SSUMapOnlyProks")){ + useSSUMapOnlyProks=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("preferSSUMapForProks") || a.equalsIgnoreCase("preferSSUMapProks")){ + preferSSUMapProks=Parse.parseBoolean(b); + } + + else if(a.equalsIgnoreCase("clearAll")){ + clear16S=clear18S=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("clear16S")){ + clear16S=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("clear18S")){ + clear18S=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("clear16SEuks")){ + clear16SEuks=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("clear18SEuks")){ + clear18SEuks=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("clear16SProks")){ + clear16SProks=Parse.parseBoolean(b); + }else if(a.equalsIgnoreCase("clear18SProks")){ + clear18SProks=Parse.parseBoolean(b); + } + + else if(parser.parse(arg, a, b)){ + //do nothing + }else{ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + if("auto".equalsIgnoreCase(r16SFile)){r16SFile=TaxTree.default16SFile();} + if("auto".equalsIgnoreCase(r18SFile)){r18SFile=TaxTree.default18SFile();} + SSUMap.r16SFile=r16SFile; + SSUMap.r18SFile=r18SFile; + + return parser; + } + + /** Add or remove .gz or .bz2 as needed */ + private void fixExtensions(){ + in1=Tools.fixExtension(in1); + if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} + } + + /** Ensure files can be read and written */ + private void checkFileExistence(){ + //Ensure output files can be written + if(!Tools.testOutputFiles(overwrite, append, false, out1)){ + outstream.println((out1==null)+", "+out1); + throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n"); + } + + //Ensure input files can be read + if(!Tools.testInputFiles(false, true, in1, r16SFile, r18SFile)){ + throw new RuntimeException("\nCan't read some input files.\n"); + } + assert(in1!=null) : "Input sketch file is required"; + assert(r16SFile!=null || r18SFile!=null) : "Input SSU file is required"; + + //Ensure that no file was specified multiple times + if(!Tools.testForDuplicateFiles(true, in1, out1, r16SFile, r18SFile)){ + throw new RuntimeException("\nSome file names were specified multiple times.\n"); + } + } + + /** Adjust file-related static fields as needed for this program */ + private static void checkStatics(){ + //Adjust the number of threads for input file reading + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ + ByteFile.FORCE_MODE_BF2=true; + } + +// if(!ByteFile.FORCE_MODE_BF2){ +// ByteFile.FORCE_MODE_BF2=false; +// ByteFile.FORCE_MODE_BF1=true; +// } + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + void process(Timer t){ + + ByteFile bf=ByteFile.makeByteFile(ffin1); + ByteStreamWriter bsw=makeBSW(ffout1); + + processInner(bf, bsw); + + errorState|=bf.close(); + if(bsw!=null){errorState|=bsw.poisonAndWait();} + + t.stop(); + + outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8)); + outstream.println(Tools.linesBytesOut(linesProcessed, bytesProcessed, linesOut, bytesOut, 8, true)); + + outstream.println(); + outstream.println(Tools.number("Sketches:", sketchCount, 8)); + outstream.println(Tools.number("16S In:", r16Sin, 8)); + outstream.println(Tools.number("18S In:", r18Sin, 8)); + outstream.println(Tools.number("16S Added:", r16SfromMap, 8)); + outstream.println(Tools.number("18S Added:", r18SfromMap, 8)); + outstream.println(Tools.numberPercent("16S Out:", r16Sout, r16Sout*100.0/sketchCount, 2, 8)); + outstream.println(Tools.numberPercent("18S Out:", r18Sout, r18Sout*100.0/sketchCount, 2, 8)); + + if(errorState){ + throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + private static ByteStreamWriter makeBSW(FileFormat ff){ + if(ff==null){return null;} + ByteStreamWriter bsw=new ByteStreamWriter(ff); + bsw.start(); + return bsw; + } + +// private void processInner_old(ByteFile bf, ByteStreamWriter bsw){ +// SSUMap.load(outstream); +// +// if(verbose){ +// System.err.println("Loaded SSUMap; |16S|="+SSUMap.r16SCount()+", |18S|="+SSUMap.r18SCount()); +// } +// +// byte[] line=bf.nextLine(); +//// ByteBuilder bb=new ByteBuilder(); +// +// final byte[] ssuBytes="SSU:".getBytes(); +// final byte[] r16SBytes="16S:".getBytes(); +// final byte[] r18SBytes="18S:".getBytes(); +// +// while(line!=null){ +// if(line.length>0){ +// if(maxLines>0 && linesProcessed>=maxLines){break;} +// linesProcessed++; +// bytesProcessed+=(line.length+1); +// +// final boolean header=(line[0]=='#'); +// +// linesOut++; +// bytesOut+=(line.length+1); +// +// if(header){ +// if(Tools.startsWith(line, "#SZ:")){ +// sketchCount++; +// +// bsw.print(line); +// +// final int tid=parseTaxID(line); +// final boolean has16S=Tools.contains(line, ssuBytes, 0) || Tools.contains(line, r16SBytes, 0); +// final boolean has18S=Tools.contains(line, r18SBytes, 0); +// +// if(verbose){ +// System.err.println("For line "+new String(line)+":"); +// System.err.println("tid="+tid+", has16S="+has16S+", has18S="+has18S); +// } +// +// if(tid>0){ +// final byte[] r16S=has16S ? null : SSUMap.r16SMap.get(tid); +// final byte[] r18S=has18S ? null : SSUMap.r18SMap.get(tid); +// if(r16S!=null){bsw.print("\t16S:").print(r16S.length); ssuOut++;} +// if(r18S!=null){bsw.print("\t18S:").print(r18S.length); ssuOut++;} +// if(r16S!=null){bsw.print("\n#16S:").print(r16S);} +// if(r18S!=null){bsw.print("\n#18S:").print(r18S);} +// +// if(verbose){System.err.println("Found 16S: "+(r16S!=null)+"; found 18S: "+(r18S!=null));} +// } +// bsw.println(); +// }else if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#18S:") || Tools.startsWith(line, "#SSU:")){ +// bsw.println(line); +// ssuIn++; +// ssuOut++; +// }else{ +// assert(Tools.startsWith(line, "##")) : new String(line); +// bsw.println(line); +// } +// }else{ +// bsw.println(line); +// } +// } +// line=bf.nextLine(); +// } +// } + + private void processInner(ByteFile bf, ByteStreamWriter bsw){ + SSUMap.load(outstream); + + if(verbose){ + System.err.println("Loaded SSUMap; |16S|="+SSUMap.r16SCount()+", |18S|="+SSUMap.r18SCount()); + } + + byte[] line=bf.nextLine(); +// ByteBuilder bb=new ByteBuilder(); + +// final byte[] ssuBytes="SSU:".getBytes(); +// final byte[] r16SBytes="16S:".getBytes(); +// final byte[] r18SBytes="18S:".getBytes(); + + SketchHeader header=null; + while(line!=null){ + if(line.length>0){ + if(maxLines>0 && linesProcessed>=maxLines){break;} + linesProcessed++; + bytesProcessed+=(line.length+1); + + final boolean isHeader=(line[0]=='#'); + + if(isHeader){ + if(Tools.startsWith(line, "#SZ:")){ + assert(header==null) : "\nReplacing this:\n"+header.toBytes()+"\nWith this:\n"+new String(line)+"\n"; + header=new SketchHeader(line); + sketchCount++; + }else if(Tools.startsWith(line, "##")){ + bsw.println(line); + + linesOut++; + bytesOut+=(line.length+1); + }else{ + header.addLine(line); + } + }else{ + if(header!=null){ + try { + processHeader(header); + } catch (Throwable e) { + e.printStackTrace(); + assert(false) : header.toBytes(); + } + r16Sout+=(header.r16S==null ? 0 : 1); + r18Sout+=(header.r18S==null ? 0 : 1); + linesOut+=1+(header.r16S==null ? 0 : 1)+(header.r18S==null ? 0 : 1); + ByteBuilder bb=header.toBytes(); + bytesOut+=(bb.length+1); + bsw.println(bb); + header=null; + } + bsw.println(line); + + linesOut++; + bytesOut+=(line.length+1); + } + } + line=bf.nextLine(); + } + } + + void processHeader(SketchHeader header){ + + if(verbose){System.err.println("Processing tid "+header.tid+":\n"+header.toBytes()+"\n");} + + final boolean euk=(tree!=null && header.tid>0 && header.tid<SketchObject.minFakeID) ? tree.isEukaryote(header.tid) : false; + final boolean prok=(tree!=null && header.tid>0 && header.tid<SketchObject.minFakeID) ? tree.isProkaryote(header.tid) : false; + if(useSSUMapOnly || (useSSUMapOnlyEuks && euk) || (useSSUMapOnlyProks && prok)){header.r16S=header.r18S=null;} + if(header.tid>0){ + final boolean preferMap=(preferSSUMap || (preferSSUMapEuks && euk) || (preferSSUMapProks && prok)); + byte[] r16S=(SSUMap.r16SMap==null ? null : SSUMap.r16SMap.get(header.tid)); + byte[] r18S=(SSUMap.r18SMap==null ? null : SSUMap.r18SMap.get(header.tid)); + if(r16S!=null && (preferMap || header.r16S==null)){ + header.r16S=r16S; + r16SfromMap++; + } + if(r18S!=null && (preferMap || header.r18S==null)){ + header.r18S=r18S; + r18SfromMap++; + } + } + if(clear16S || (clear16SEuks && euk) || (clear16SProks && prok)){header.r16S=null;} + if(clear18S || (clear18SEuks && euk) || (clear18SProks && prok)){header.r18S=null;} + } + + int parseTaxID(byte[] line){ + String[] split=Tools.tabPattern.split(new String(line)); + for(String s : split){ + if(s.startsWith("ID:") || s.startsWith("TAXID:")){ + final int colon=s.indexOf(':'); + final String sub=s.substring(colon+1); + return Integer.parseInt(sub); + } + } + return -1; + } + + /*--------------------------------------------------------------*/ + + //A very limited parser + private class SketchHeader { + + SketchHeader(byte[] line){ + this(new String(line, 1, line.length-1)); + } + + SketchHeader(String line){ + if(line.charAt(0)=='#'){line=line.substring(1);} + assert(line.startsWith("SZ:")); + String[] split=Tools.tabPattern.split(line); + fields=new ArrayList<String>(line.length()+2); + int tid_=-1; + for(String s : split){ + if(s.startsWith("16S:") || s.startsWith("18S:") || s.startsWith("SSU:")){ + //do nothing + }else{ + if(s.startsWith("ID:") || s.startsWith("TAXID:")){ + final int colon=s.indexOf(':'); + final String sub=s.substring(colon+1); + tid_=Integer.parseInt(sub); + } + fields.add(s); + } + } + tid=tid_; + } + + void addLine(byte[] line){ + assert(line[0]=='#'); + assert(line[1]=='1' || line[1]=='S') : new String(line); + if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){ + assert(r16S==null); + r16S=Arrays.copyOfRange(line, 5, line.length); + r16Sin++; + }else if(Tools.startsWith(line, "#18S:")){ + assert(r18S==null); + r18S=Arrays.copyOfRange(line, 5, line.length); + r18Sin++; + }else{ + assert(false) : new String(line); + } + } + + ByteBuilder toBytes(){ + ByteBuilder bb=new ByteBuilder(1000); + bb.append('#'); + for(int i=0; i<fields.size(); i++){ + if(i>0){bb.tab();} + bb.append(fields.get(i)); + } + if(r16S!=null){bb.tab().append("16S:").append(r16S.length);} + if(r18S!=null){bb.tab().append("18S:").append(r18S.length);} + + if(r16S!=null){bb.nl().append("#16S:").append(r16S);} + if(r18S!=null){bb.nl().append("#18S:").append(r18S);} + return bb; + } + + final int tid; + ArrayList<String> fields; + byte[] r16S; + byte[] r18S; + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private String in1=null; + private String out1=null; + private String r16SFile="auto"; + private String r18SFile="auto"; + private String treeFile="auto"; + + boolean preferSSUMap=false; + boolean preferSSUMapEuks=false; + boolean preferSSUMapProks=false; + boolean useSSUMapOnly=false; + boolean useSSUMapOnlyEuks=false; + boolean useSSUMapOnlyProks=false; + boolean clear16S=false; + boolean clear18S=false; + boolean clear16SEuks=false; + boolean clear18SEuks=false; + boolean clear16SProks=false; + boolean clear18SProks=false; + + /*--------------------------------------------------------------*/ + + private long linesProcessed=0; + private long linesOut=0; + private long bytesProcessed=0; + private long bytesOut=0; + + private long sketchCount=0; + + private long r16Sin=0; + private long r16Sout=0; + private long r16SfromMap=0; + private long r18Sin=0; + private long r18Sout=0; + private long r18SfromMap=0; + + private long maxLines=Long.MAX_VALUE; + + /*--------------------------------------------------------------*/ + /*---------------- Final Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private final FileFormat ffin1; + private final FileFormat ffout1; + + private final TaxTree tree; + + /*--------------------------------------------------------------*/ + /*---------------- Common Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private PrintStream outstream=System.err; + public static boolean verbose=false; + public boolean errorState=false; + private boolean overwrite=false; + private boolean append=false; + +}