Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/AnalyzeAccession.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/AnalyzeAccession.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,446 @@ +package tax; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map.Entry; + +import fileIO.ByteFile; +import fileIO.ByteFile1; +import fileIO.ByteFile2; +import fileIO.ByteStreamWriter; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextFile; +import shared.Parse; +import shared.Parser; +import shared.PreParser; +import shared.Shared; +import shared.Timer; +import shared.Tools; +import stream.ConcurrentGenericReadInputStream; +import stream.FastaReadInputStream; +import structures.ByteBuilder; +import structures.ListNum; +import structures.StringNum; +import template.Accumulator; +import template.ThreadWaiter; + +/** + * Counts patterns in Accessions. + * Handles hashing for Accession to TaxID lookups. + * @author Brian Bushnell + * @date May 9, 2018 + * + */ +public class AnalyzeAccession implements Accumulator<AnalyzeAccession.ProcessThread> { + + public static void main(String[] args){ + //Start a timer immediately upon code entrance. + Timer t=new Timer(); + + //Create an instance of this class + AnalyzeAccession x=new AnalyzeAccession(args); + + //Run the object + x.process(t); + + //Close the print stream if it was redirected + Shared.closeStream(x.outstream); + } + + public AnalyzeAccession(String[] args){ + + {//Preparse block for help, config files, and outstream + PreParser pp=new PreParser(args, getClass(), false); + args=pp.args; + outstream=pp.outstream; + } + + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=Shared.threads(); + + Parser parser=new Parser(); + for(int i=0; i<args.length; i++){ + String arg=args[i]; + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + + if(a.equals("verbose")){ + verbose=Parse.parseBoolean(b); + ByteFile1.verbose=verbose; + ByteFile2.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + ConcurrentGenericReadInputStream.verbose=verbose; + stream.FastqReadInputStream.verbose=verbose; + ReadWrite.verbose=verbose; + }else if(a.equals("in")){ + if(b==null){in.clear();} + else{ + String[] split2=b.split(","); + for(String s2 : split2){ + in.add(s2); + } + } + }else if(a.equals("perfile")){ + perFile=Parse.parseBoolean(b); + }else if(b==null && new File(arg).exists()){ + in.add(arg); + }else if(parser.parse(arg, a, b)){ + //do nothing + }else{ + outstream.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + {//Process parser fields + overwrite=parser.overwrite; + append=parser.append; + + out=parser.out1; + } + + assert(FastaReadInputStream.settingsOK()); + + if(in==null){throw new RuntimeException("Error - at least one input file is required.");} + +// if(!ByteFile.FORCE_MODE_BF2){ +// ByteFile.FORCE_MODE_BF2=false; +// ByteFile.FORCE_MODE_BF1=true; +// } + + if(out!=null && out.equalsIgnoreCase("null")){out=null;} + + if(!Tools.testOutputFiles(overwrite, append, false, out)){ + outstream.println((out==null)+", "+out); + throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n"); + } + + ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false); + ffina=new FileFormat[in.size()]; + for(int i=0; i<in.size(); i++){ + ffina[i]=FileFormat.testInput(in.get(i), FileFormat.TXT, null, true, false); + } + } + + void process(Timer t){ + + if(perFile) { + process_perFile(); + }else{ + for(FileFormat ffin : ffina){ + process_inner(ffin); + } + } + + if(ffout!=null){ + ByteStreamWriter bsw=new ByteStreamWriter(ffout); + bsw.println("#Pattern\tCount\tCombos\tBits"); + ArrayList<StringNum> list=new ArrayList<StringNum>(); + list.addAll(countMap.values()); + Collections.sort(list); + Collections.reverse(list); + for(StringNum sn : list){ + double combos=1; + for(int i=0; i<sn.s.length(); i++){ + char c=sn.s.charAt(i); + if(c=='D'){combos*=10;} + else if(c=='L'){combos*=26;} + } + bsw.print(sn.toString().getBytes()); + bsw.println("\t"+(long)combos+"\t"+String.format(Locale.ROOT, "%.2f", Tools.log2(combos))); + } + bsw.start(); + errorState|=bsw.poisonAndWait(); + } + + t.stop(); + + outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8)); + + outstream.println(); + outstream.println("Valid Lines: \t"+linesOut); + outstream.println("Invalid Lines: \t"+(linesProcessed-linesOut)); + + if(errorState){ + throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + void process_inner(FileFormat ffin){ + + ByteFile bf=ByteFile.makeByteFile(ffin); + + final int threads=Tools.min(8, Shared.threads()); + ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); + for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));} + boolean success=ThreadWaiter.startAndWait(alpt, this); + errorState|=!success; + } + + + void process_perFile(){ + ArrayList<ArrayList<ProcessThread>> perFileList=new ArrayList<ArrayList<ProcessThread>>(ffina.length); + for(FileFormat ffin : ffina) { + ByteFile bf=ByteFile.makeByteFile(ffin); + + final int threads=Tools.min(16, Shared.threads()); + ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); + for(int i=0; i<threads; i++){alpt.add(new ProcessThread(bf));} + perFileList.add(alpt); + ThreadWaiter.startThreads(alpt); + } + for(ArrayList<ProcessThread> alpt : perFileList){ + boolean success=ThreadWaiter.waitForThreads(alpt, this); + errorState|=!success; + } + } + + /*--------------------------------------------------------------*/ + + static class ProcessThread extends Thread { + + ProcessThread(ByteFile bf_){ + bf=bf_; + } + + @Override + public void run() { + final StringBuilder buffer=new StringBuilder(128); + for(ListNum<byte[]> lines=bf.nextList(); lines!=null; lines=bf.nextList()){ + assert(lines.size()>0); + if(lines.id==0){ + //This one is not really important; the header could be missing. + assert(Tools.startsWith(lines.get(0), "accession")) : bf.name()+"[0]: "+new String(lines.get(0)); + }else{ + assert(!Tools.startsWith(lines.get(0), "accession")) : bf.name()+"["+lines.id+"]: "+new String(lines.get(0)); + } + for(byte[] line : lines){ + if(line.length>0){ + linesProcessedT++; + bytesProcessedT+=(line.length+1); + + boolean valid=lines.id>0 || !(Tools.startsWith(line, "accession")); //Skips test for most lines + + if(valid){ + linesOutT++; + increment(line, buffer); + } + } + } + } + } + + void increment(byte[] line, StringBuilder buffer){ + buffer.setLength(0); + for(int i=0; i<line.length; i++){ + final byte b=line[i]; + if(b==' ' || b=='\t' || b=='.' || b==':'){break;} + final char b2=(char)remap[b]; + assert(b2!='?' || b=='+') : "unprocessed symbol in "+new String(line)+"\n"+"'"+(char)b+"'"; + buffer.append(b2); + } + String key=buffer.toString(); + StringNum value=countMapT.get(key); + if(value!=null){value.increment();} + else{countMapT.put(key, new StringNum(key, 1));} + } + + private HashMap<String, StringNum> countMapT=new HashMap<String, StringNum>(); + private final ByteFile bf; + long linesProcessedT=0; + long linesOutT=0; + long bytesProcessedT=0; + + } + + /*--------------------------------------------------------------*/ + + @Override + public void accumulate(ProcessThread t) { + linesProcessed+=t.linesProcessedT; + linesOut+=t.linesOutT; + bytesProcessed+=t.bytesProcessedT; + for(Entry<String, StringNum> e : t.countMapT.entrySet()){ + StringNum value=e.getValue(); + final String key=e.getKey(); + StringNum old=countMap.get(key); + if(old==null){countMap.put(key, value);} + else{old.add(value);} + } + } + + @Override + public boolean success() { + return !errorState; + } + + /*--------------------------------------------------------------*/ + + public static long combos(String s){ + double combos=1; + for(int i=0; i<s.length(); i++){ + char c=s.charAt(i); + if(c=='D'){combos*=10;} + else if(c=='L'){combos*=26;} + } + return (combos>=Long.MAX_VALUE ? Long.MAX_VALUE : (long)Math.ceil(combos)); + } + + public static long combos(byte[] s){ + double combos=1; + for(int i=0; i<s.length; i++){ + byte c=s[i]; + if(c=='D'){combos*=10;} + else if(c=='L'){combos*=26;} + } + return (combos>=Long.MAX_VALUE ? -1 : (long)Math.ceil(combos)); + } + + /*--------------------------------------------------------------*/ + + public static HashMap<String, Integer> loadCodeMap(String fname){ + assert(codeMap==null); + TextFile tf=new TextFile(fname); + ArrayList<String> list=new ArrayList<String>(); + for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ + if(!line.startsWith("#")){ + String[] split=line.split("\t"); + list.add(split[0]); + } + } + HashMap<String, Integer> map=new HashMap<String, Integer>(list.size()*3); + codeBits=(int)Math.ceil(Tools.log2(list.size())); + final int patternBits=63-codeBits; + final long maxCombos=((1L<<(patternBits-1))-1); + for(int i=0; i<list.size(); i++){ + String s=list.get(i); + longestPattern=Tools.max(longestPattern, s.length()); + long combos=combos(s); + if(combos<0 || combos>=maxCombos){map.put(s, -1);} + else{map.put(s, i);} + } + codeMap=map; + return map; + } + + public static long digitize(String s){ + String pattern=remap(s); + Integer code=codeMap.get(pattern); + if(code==null){return -2;} + if(code.intValue()<0){return -1;} + + long number=0; + for(int i=0; i<pattern.length(); i++){ + char c=s.charAt(i); + char p=pattern.charAt(i); + if(p=='-' || p=='?'){ + //do nothing + }else if(p=='D'){ + number=(number*10)+(c-'0'); + }else if(p=='L'){ + number=(number*26)+(Tools.toUpperCase(c)-'A'); + }else{ + assert(false) : s; + } + } + number=(number<<codeBits)+code; + return number; + } + + public static long digitize(byte[] s){ + String pattern=remap(s); + Integer code=codeMap.get(pattern); + if(code==null){return -2;} + if(code.intValue()<0){return -1;} + + long number=0; + for(int i=0; i<pattern.length(); i++){ + byte c=s[i]; + char p=pattern.charAt(i); + if(p=='-' || p=='?'){ + //do nothing + }else if(p=='D'){ + number=(number*10)+(c-'0'); + }else if(p=='L'){ + number=(number*26)+(Tools.toUpperCase(c)-'A'); + }else{ + assert(false) : new String(s); + } + } + number=(number<<codeBits)+code; + return number; + } + + public static String remap(String s){ + if(s==null || s.length()<1){return "";} + ByteBuilder buffer=new ByteBuilder(s.length()); + for(int i=0; i<s.length(); i++){ + final char b=s.charAt(i); + if(b==' ' || b=='\t' || b=='.' || b==':'){break;} + buffer.append((char)remap[b]); + } + return buffer.toString(); + } + + public static String remap(byte[] s){ + ByteBuilder buffer=new ByteBuilder(s.length); + for(int i=0; i<s.length; i++){ + final byte b=s[i]; + if(b==' ' || b=='\t' || b=='.' || b==':'){break;} + buffer.append((char)remap[b]); + } + return buffer.toString(); + } + + /*--------------------------------------------------------------*/ + + private ArrayList<String> in=new ArrayList<String>(); + private String out=null; + private boolean perFile=true; + + /*--------------------------------------------------------------*/ + + private HashMap<String, StringNum> countMap=new HashMap<String, StringNum>(); + public static HashMap<String, Integer> codeMap; + private static int codeBits=-1; + private static int longestPattern=-1; + + private long linesProcessed=0; + private long linesOut=0; + private long bytesProcessed=0; + private long bytesOut=0; + + /*--------------------------------------------------------------*/ + + private final FileFormat[] ffina; + private final FileFormat ffout; + + private static final byte[] remap=makeRemap(); + + private static byte[] makeRemap(){ + byte[] array=new byte[128]; + Arrays.fill(array, (byte)'?'); + for(int i='A'; i<='Z'; i++){array[i]='L';} + for(int i='a'; i<='z'; i++){array[i]='L';} + for(int i='0'; i<='9'; i++){array[i]='D';} + array['_']=array['-']='-'; + return array; + } + + /*--------------------------------------------------------------*/ + + private PrintStream outstream=System.err; + public static boolean verbose=false; + public boolean errorState=false; + private boolean overwrite=false; + private boolean append=false; + +}