// view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/tax/AnalyzeAccession.java @ 68:5028fdace37b
//
// planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
// author jpayne
// date Tue, 18 Mar 2025 16:23:26 -0400
// parents
// children
// line wrap: on
// line source
package tax;

import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map.Entry;

import fileIO.ByteFile;
import fileIO.ByteFile1;
import fileIO.ByteFile2;
import fileIO.ByteStreamWriter;
import fileIO.FileFormat;
import fileIO.ReadWrite;
import fileIO.TextFile;
import shared.Parse;
import shared.Parser;
import shared.PreParser;
import shared.Shared;
import shared.Timer;
import shared.Tools;
import stream.ConcurrentGenericReadInputStream;
import stream.FastaReadInputStream;
import structures.ByteBuilder;
import structures.ListNum;
import structures.StringNum;
import template.Accumulator;
import template.ThreadWaiter;

/**
 * Counts patterns in Accessions.
 * Handles hashing for Accession to TaxID lookups.
 * @author Brian Bushnell
 * @date May 9, 2018
 *
 */
public class AnalyzeAccession implements Accumulator<AnalyzeAccession.ProcessThread> {
	
	/**
	 * Program entry point: constructs an analyzer from the command line,
	 * runs it, and then closes its print stream.
	 * @param args Command-line arguments
	 */
	public static void main(String[] args){
		//The timer is created first so that argument parsing is included in the reported time.
		final Timer timer=new Timer();
		
		//All argument parsing happens inside the constructor.
		final AnalyzeAccession analyzer=new AnalyzeAccession(args);
		
		//Do the work and print the summary.
		analyzer.process(timer);
		
		//Release the print stream, which the preparser may have redirected.
		Shared.closeStream(analyzer.outstream);
	}
	
	public AnalyzeAccession(String[] args){
		
		{//Preparse block for help, config files, and outstream
			PreParser pp=new PreParser(args, getClass(), false);
			args=pp.args;
			outstream=pp.outstream;
		}
		
		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
		
		Parser parser=new Parser();
		for(int i=0; i<args.length; i++){
			String arg=args[i];
			String[] split=arg.split("=");
			String a=split[0].toLowerCase();
			String b=split.length>1 ? split[1] : null;

			if(a.equals("verbose")){
				verbose=Parse.parseBoolean(b);
				ByteFile1.verbose=verbose;
				ByteFile2.verbose=verbose;
				stream.FastaReadInputStream.verbose=verbose;
				ConcurrentGenericReadInputStream.verbose=verbose;
				stream.FastqReadInputStream.verbose=verbose;
				ReadWrite.verbose=verbose;
			}else if(a.equals("in")){
				if(b==null){in.clear();}
				else{
					String[] split2=b.split(",");
					for(String s2 : split2){
						in.add(s2);
					}
				}
			}else if(a.equals("perfile")){
				perFile=Parse.parseBoolean(b);
			}else if(b==null && new File(arg).exists()){
				in.add(arg);
			}else if(parser.parse(arg, a, b)){
				//do nothing
			}else{
				outstream.println("Unknown parameter "+args[i]);
				assert(false) : "Unknown parameter "+args[i];
				//				throw new RuntimeException("Unknown parameter "+args[i]);
			}
		}
		
		{//Process parser fields
			overwrite=parser.overwrite;
			append=parser.append;

			out=parser.out1;
		}
		
		assert(FastaReadInputStream.settingsOK());
		
		if(in==null){throw new RuntimeException("Error - at least one input file is required.");}
		
//		if(!ByteFile.FORCE_MODE_BF2){
//			ByteFile.FORCE_MODE_BF2=false;
//			ByteFile.FORCE_MODE_BF1=true;
//		}

		if(out!=null && out.equalsIgnoreCase("null")){out=null;}
		
		if(!Tools.testOutputFiles(overwrite, append, false, out)){
			outstream.println((out==null)+", "+out);
			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n");
		}

		ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false);
		ffina=new FileFormat[in.size()];
		for(int i=0; i<in.size(); i++){
			ffina[i]=FileFormat.testInput(in.get(i), FileFormat.TXT, null, true, false);
		}
	}
	
	/**
	 * Counts patterns over all input files, writes the pattern table when an
	 * output format is present, and prints timing and line statistics.
	 * Each output row is the StringNum's own text followed by the number of
	 * distinct strings the pattern admits (10 per 'D', 26 per 'L') and its log2.
	 * @param t Timer supplied by the caller; stopped here before statistics are printed
	 * @throws RuntimeException If the error state was set during processing or writing
	 */
	void process(Timer t){
		
		//Either handle the files one after another, or launch all of them together.
		if(!perFile){
			for(int i=0; i<ffina.length; i++){
				process_inner(ffina[i]);
			}
		}else{
			process_perFile();
		}
		
		if(ffout!=null){
			final ByteStreamWriter writer=new ByteStreamWriter(ffout);
			writer.println("#Pattern\tCount\tCombos\tBits");
			
			//Sort by StringNum's natural order, then flip the list.
			final ArrayList<StringNum> ranked=new ArrayList<StringNum>(countMap.values());
			Collections.sort(ranked);
			Collections.reverse(ranked);
			
			for(final StringNum entry : ranked){
				//A double is used so that long patterns do not overflow while multiplying.
				double combos=1;
				for(final char symbol : entry.s.toCharArray()){
					switch(symbol){
						case 'D': combos*=10; break;
						case 'L': combos*=26; break;
						default: break;
					}
				}
				final String bits=String.format(Locale.ROOT, "%.2f", Tools.log2(combos));
				writer.print(entry.toString().getBytes());
				writer.println("\t"+(long)combos+"\t"+bits);
			}
			
			//NOTE(review): the writer is only started after every row has been handed to it;
			//confirm ByteStreamWriter is meant to be used in this order.
			writer.start();
			if(writer.poisonAndWait()){errorState=true;}
		}
		
		t.stop();
		
		outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
		
		final long invalidLines=linesProcessed-linesOut;
		outstream.println();
		outstream.println("Valid Lines:       \t"+linesOut);
		outstream.println("Invalid Lines:     \t"+invalidLines);
		
		if(errorState){
			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
		}
	}
	
	/**
	 * Processes a single input file with a pool of worker threads that share one
	 * ByteFile, blocking until the pool has finished. At most 8 workers are used.
	 * A failure reported by ThreadWaiter sets the error state.
	 * @param ffin Format of the file to read
	 */
	void process_inner(FileFormat ffin){
		
		final ByteFile source=ByteFile.makeByteFile(ffin);
		
		final int workerCount=Tools.min(8, Shared.threads());
		final ArrayList<ProcessThread> workers=new ArrayList<ProcessThread>(workerCount);
		while(workers.size()<workerCount){
			workers.add(new ProcessThread(source));
		}
		
		//This object is passed as the accumulator for the finished workers.
		if(!ThreadWaiter.startAndWait(workers, this)){
			errorState=true;
		}
	}
	
	
	/**
	 * Processes all input files concurrently: a worker pool (at most 16 threads)
	 * is started for every file before any pool is waited on.
	 * A failure reported by ThreadWaiter for any pool sets the error state.
	 */
	void process_perFile(){
		final ArrayList<ArrayList<ProcessThread>> pools=new ArrayList<ArrayList<ProcessThread>>(ffina.length);
		
		//Launch phase: one reader and one pool per file, all started without waiting.
		for(int f=0; f<ffina.length; f++){
			final ByteFile source=ByteFile.makeByteFile(ffina[f]);
			
			final int workerCount=Tools.min(16, Shared.threads());
			final ArrayList<ProcessThread> pool=new ArrayList<ProcessThread>(workerCount);
			while(pool.size()<workerCount){
				pool.add(new ProcessThread(source));
			}
			pools.add(pool);
			ThreadWaiter.startThreads(pool);
		}
		
		//Join phase: wait on the pools in launch order, with this object as the accumulator.
		for(final ArrayList<ProcessThread> pool : pools){
			if(!ThreadWaiter.waitForThreads(pool, this)){
				errorState=true;
			}
		}
	}
	
	/*--------------------------------------------------------------*/
	
	/**
	 * Worker thread that pulls lists of lines from a ByteFile and counts the
	 * character-class pattern of the leading token of each non-empty line.
	 * Counts are kept in thread-local fields that the enclosing class reads
	 * in accumulate().
	 */
	static class ProcessThread extends Thread {
		
		/**
		 * @param bf_ Line source to read from
		 */
		ProcessThread(ByteFile bf_){
			bf=bf_;
		}
		
		/**
		 * Reads line lists until the source returns null, counting every non-empty
		 * line and tallying the pattern of each line that is not a header.
		 */
		@Override
		public void run() {
			final StringBuilder buffer=new StringBuilder(128);
			for(ListNum<byte[]> lines=bf.nextList(); lines!=null; lines=bf.nextList()){
				assert(lines.size()>0);
				if(lines.id==0){
					//This one is not really important; the header could be missing.
					assert(Tools.startsWith(lines.get(0), "accession")) : bf.name()+"[0]: "+new String(lines.get(0));
				}else{
					assert(!Tools.startsWith(lines.get(0), "accession")) : bf.name()+"["+lines.id+"]: "+new String(lines.get(0));
				}
				for(byte[] line : lines){
					if(line.length>0){
						linesProcessedT++;
						bytesProcessedT+=(line.length+1);//+1 per line, presumably for the stripped newline
						
						boolean valid=lines.id>0 || !(Tools.startsWith(line, "accession")); //Skips test for most lines
						
						if(valid){
							linesOutT++;
							increment(line, buffer);
						}
					}
				}
			}
		}
		
		/**
		 * Converts the leading token of a line to its pattern and adds one to that pattern's count.
		 * The token ends at the first space, tab, '.', or ':'.
		 * @param line Raw line bytes
		 * @param buffer Scratch buffer; cleared on entry
		 */
		void increment(byte[] line, StringBuilder buffer){
			buffer.setLength(0);
			for(int i=0; i<line.length; i++){
				final byte b=line[i];
				if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
				//Bytes are signed: a non-ASCII byte is negative and would index before the start of the table,
				//so it is treated as an unclassified symbol instead.
				final char b2=(b<0 ? '?' : (char)remap[b]);
				assert(b2!='?' || b=='+') : "unprocessed symbol in "+new String(line)+"\n"+"'"+(char)b+"'";
				buffer.append(b2);
			}
			String key=buffer.toString();
			StringNum value=countMapT.get(key);
			if(value!=null){value.increment();}
			else{countMapT.put(key, new StringNum(key, 1));}
		}
		
		/** Pattern counts seen by this thread */
		private final HashMap<String, StringNum> countMapT=new HashMap<String, StringNum>();
		/** Line source */
		private final ByteFile bf;
		/** Non-empty lines read by this thread */
		long linesProcessedT=0;
		/** Lines whose pattern was counted by this thread */
		long linesOutT=0;
		/** Sum of (line length + 1) over the lines read by this thread */
		long bytesProcessedT=0;
		
	}
	
	/*--------------------------------------------------------------*/

	/**
	 * Folds one worker's line/byte counters and pattern counts into this object's totals.
	 * A pattern not seen before adopts the worker's StringNum instance directly;
	 * otherwise the worker's value is added to the existing one.
	 * @param t Worker whose results are merged
	 */
	@Override
	public void accumulate(ProcessThread t) {
		bytesProcessed+=t.bytesProcessedT;
		linesOut+=t.linesOutT;
		linesProcessed+=t.linesProcessedT;
		
		for(Entry<String, StringNum> entry : t.countMapT.entrySet()){
			final String pattern=entry.getKey();
			final StringNum incoming=entry.getValue();
			final StringNum existing=countMap.get(pattern);
			if(existing!=null){
				existing.add(incoming);
			}else{
				countMap.put(pattern, incoming);
			}
		}
	}

	/**
	 * @return true as long as the error state has not been set
	 */
	@Override
	public boolean success() {
		final boolean failed=errorState;
		return !failed;
	}
	
	/*--------------------------------------------------------------*/
	
	/**
	 * Counts how many distinct strings match a pattern: each 'D' contributes a
	 * factor of 10 and each 'L' a factor of 26; other symbols contribute nothing.
	 * @param s Pattern string
	 * @return The number of combinations, capped at Long.MAX_VALUE when it does not fit in a long
	 */
	public static long combos(String s){
		//Multiply in floating point so that long patterns cannot wrap around.
		double product=1;
		for(final char symbol : s.toCharArray()){
			switch(symbol){
				case 'D': product*=10; break;
				case 'L': product*=26; break;
				default: break;
			}
		}
		if(product>=Long.MAX_VALUE){
			return Long.MAX_VALUE;
		}
		return (long)Math.ceil(product);
	}
	
	/**
	 * Counts how many distinct strings match a pattern given as bytes: each 'D'
	 * contributes a factor of 10 and each 'L' a factor of 26; other symbols contribute nothing.
	 * Unlike the String overload, overflow is reported as -1.
	 * @param s Pattern bytes
	 * @return The number of combinations, or -1 when it does not fit in a long
	 */
	public static long combos(byte[] s){
		//Multiply in floating point so that long patterns cannot wrap around.
		double product=1;
		for(final byte symbol : s){
			switch(symbol){
				case 'D': product*=10; break;
				case 'L': product*=26; break;
				default: break;
			}
		}
		if(product>=Long.MAX_VALUE){
			return -1;
		}
		return (long)Math.ceil(product);
	}
	
	/*--------------------------------------------------------------*/
	
	/**
	 * Loads a pattern table and builds the pattern-to-code map used by digitize().
	 * The first tab-delimited column of every line not starting with '#' is a pattern.
	 * A pattern's code is its index in the file, or -1 when its number of
	 * combinations does not fit in the bits left over after the code bits.
	 * Also sets codeBits, longestPattern, and the static codeMap.
	 * @param fname Path of the pattern file
	 * @return The newly built map, which is also stored in codeMap
	 */
	public static HashMap<String, Integer> loadCodeMap(String fname){
		assert(codeMap==null);
		TextFile tf=new TextFile(fname);
		ArrayList<String> list=new ArrayList<String>();
		for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
			if(!line.startsWith("#")){
				String[] split=line.split("\t");
				list.add(split[0]);
			}
		}
		//Release the file handle once all lines have been read.
		tf.close();
		
		HashMap<String, Integer> map=new HashMap<String, Integer>(list.size()*3);
		//Enough bits to hold any index into the list.
		codeBits=(int)Math.ceil(Tools.log2(list.size()));
		final int patternBits=63-codeBits;
		final long maxCombos=((1L<<(patternBits-1))-1);
		for(int i=0; i<list.size(); i++){
			String s=list.get(i);
			longestPattern=Tools.max(longestPattern, s.length());
			long combos=combos(s);
			//Patterns with too many combinations are kept in the map but flagged with -1.
			if(combos<0 || combos>=maxCombos){map.put(s, -1);}
			else{map.put(s, i);}
		}
		codeMap=map;
		return map;
	}
	
	/**
	 * Encodes an accession as a number: the digits and letters of its leading
	 * token are folded into a mixed-radix value (base 10 for 'D', base 26 for
	 * 'L'), which is then shifted left by codeBits and combined with the pattern's code.
	 * Requires codeMap to have been loaded.
	 * @param s Accession string
	 * @return The encoded value; -2 if the pattern is not in codeMap; -1 if its code is negative
	 */
	public static long digitize(String s){
		final String pattern=remap(s);
		final Integer code=codeMap.get(pattern);
		if(code==null){return -2;}
		if(code.intValue()<0){return -1;}
		
		long number=0;
		final int len=pattern.length();
		for(int pos=0; pos<len; pos++){
			final char symbol=s.charAt(pos);
			switch(pattern.charAt(pos)){
				case '-':
				case '?':
					//These positions carry no value.
					break;
				case 'D':
					number=(number*10)+(symbol-'0');
					break;
				case 'L':
					number=(number*26)+(Tools.toUpperCase(symbol)-'A');
					break;
				default:
					assert(false) : s;
			}
		}
		return (number<<codeBits)+code;
	}
	
	/**
	 * Encodes an accession given as bytes: the digits and letters of its leading
	 * token are folded into a mixed-radix value (base 10 for 'D', base 26 for
	 * 'L'), which is then shifted left by codeBits and combined with the pattern's code.
	 * Requires codeMap to have been loaded.
	 * @param s Accession bytes
	 * @return The encoded value; -2 if the pattern is not in codeMap; -1 if its code is negative
	 */
	public static long digitize(byte[] s){
		final String pattern=remap(s);
		final Integer code=codeMap.get(pattern);
		if(code==null){return -2;}
		if(code.intValue()<0){return -1;}
		
		long number=0;
		final int len=pattern.length();
		for(int pos=0; pos<len; pos++){
			final byte symbol=s[pos];
			switch(pattern.charAt(pos)){
				case '-':
				case '?':
					//These positions carry no value.
					break;
				case 'D':
					number=(number*10)+(symbol-'0');
					break;
				case 'L':
					number=(number*26)+(Tools.toUpperCase(symbol)-'A');
					break;
				default:
					assert(false) : new String(s);
			}
		}
		return (number<<codeBits)+code;
	}
	
	/**
	 * Converts the leading token of a string to its pattern: letters become 'L',
	 * digits 'D', '_' and '-' become '-', and anything else '?'.
	 * The token ends at the first space, tab, '.', or ':'.
	 * @param s Input string; may be null
	 * @return The pattern, or an empty string for null or empty input
	 */
	public static String remap(String s){
		if(s==null || s.length()<1){return "";}
		ByteBuilder buffer=new ByteBuilder(s.length());
		for(int i=0; i<s.length(); i++){
			final char b=s.charAt(i);
			if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
			//The table only covers 7-bit ASCII; a char beyond it would index past the end,
			//so it is reported as an unclassified symbol.
			buffer.append(b<remap.length ? (char)remap[b] : '?');
		}
		return buffer.toString();
	}
	
	/**
	 * Converts the leading token of a byte array to its pattern: letters become 'L',
	 * digits 'D', '_' and '-' become '-', and anything else '?'.
	 * The token ends at the first space, tab, '.', or ':'.
	 * @param s Input bytes; may be null
	 * @return The pattern, or an empty string for null or empty input
	 */
	public static String remap(byte[] s){
		//Same null/empty handling as the String overload.
		if(s==null || s.length<1){return "";}
		ByteBuilder buffer=new ByteBuilder(s.length);
		for(int i=0; i<s.length; i++){
			final byte b=s[i];
			if(b==' ' || b=='\t' || b=='.' || b==':'){break;}
			//Bytes are signed: a non-ASCII byte is negative and would index before the start of the table,
			//so it is reported as an unclassified symbol.
			buffer.append(b<0 ? '?' : (char)remap[b]);
		}
		return buffer.toString();
	}
	
	/*--------------------------------------------------------------*/
	
	/** Input file paths, gathered from in= (comma-delimited) and from bare arguments naming existing files */
	private ArrayList<String> in=new ArrayList<String>();
	/** Output path taken from the parser; the constructor turns the literal word "null" into null */
	private String out=null;
	/** When true, process() launches all input files together via process_perFile(); otherwise files are handled one at a time */
	private boolean perFile=true;
	
	/*--------------------------------------------------------------*/

	/** Pattern to count, merged from all worker threads in accumulate() */
	private HashMap<String, StringNum> countMap=new HashMap<String, StringNum>();
	/** Pattern to code (index in the pattern file, or -1 for patterns with too many combinations); set by loadCodeMap() */
	public static HashMap<String, Integer> codeMap;
	/** Number of low-order bits that digitize() uses for the pattern code; set by loadCodeMap() */
	private static int codeBits=-1;
	/** Length of the longest pattern seen by loadCodeMap() */
	private static int longestPattern=-1;
	
	/** Total non-empty lines read by the workers */
	private long linesProcessed=0;
	/** Total lines whose pattern was counted; reported as "Valid Lines" */
	private long linesOut=0;
	/** Total of (line length + 1) over the lines read */
	private long bytesProcessed=0;
	/** Not referenced anywhere else in the visible class */
	private long bytesOut=0;
	
	/*--------------------------------------------------------------*/
	
	/** One input format per entry of the input list, built in the constructor */
	private final FileFormat[] ffina;
	/** Output format built in the constructor; process() writes the pattern table only when this is non-null */
	private final FileFormat ffout;
	
	/** Lookup table from a 7-bit character code to its pattern symbol ('L', 'D', '-', or '?') */
	private static final byte[] remap=makeRemap();
	
	/**
	 * Builds the 128-entry table that classifies 7-bit character codes:
	 * letters map to 'L', digits to 'D', underscore and hyphen to '-',
	 * and every other code to '?'.
	 * @return The classification table
	 */
	private static byte[] makeRemap(){
		final byte[] table=new byte[128];
		for(int code=0; code<table.length; code++){
			final byte symbol;
			if(code>='0' && code<='9'){
				symbol='D';
			}else if((code>='A' && code<='Z') || (code>='a' && code<='z')){
				symbol='L';
			}else if(code=='_' || code=='-'){
				symbol='-';
			}else{
				symbol='?';
			}
			table[code]=symbol;
		}
		return table;
	}
	
	/*--------------------------------------------------------------*/
	
	/** Destination for status messages; defaults to System.err and is replaced by the preparser's stream in the constructor */
	private PrintStream outstream=System.err;
	/** Set by the verbose flag, which also copies the value to several I/O classes */
	public static boolean verbose=false;
	/** Set when a worker pool or the output writer reports a failure; process() throws if it is set */
	public boolean errorState=false;
	/** Copied from the parser; passed to the output-file checks */
	private boolean overwrite=false;
	/** Copied from the parser; passed to the output-file checks */
	private boolean append=false;
	
}