jpayne@68: package prok; jpayne@68: jpayne@68: import java.io.File; jpayne@68: jpayne@68: import dna.AminoAcid; jpayne@68: import dna.Data; jpayne@68: import fileIO.FileFormat; jpayne@68: import fileIO.ReadWrite; jpayne@68: import shared.Parse; jpayne@68: import shared.Tools; jpayne@68: import stream.ConcurrentReadInputStream; jpayne@68: import stream.Read; jpayne@68: import stream.ReadInputStream; jpayne@68: import structures.ListNum; jpayne@68: import structures.LongHashSet; jpayne@68: jpayne@68: /** Contains a lot of statics and static methods for gene-calling */ jpayne@68: public abstract class ProkObject { jpayne@68: jpayne@68: public static boolean parse(String arg, String a, String b){ jpayne@68: if(a.equalsIgnoreCase("16sstartslop") || a.equalsIgnoreCase("ssustartslop")){ jpayne@68: ssuStartSlop=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("23sstartslop") || a.equalsIgnoreCase("lsustartslop")){ jpayne@68: lsuStartSlop=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("5sstartslop")){ jpayne@68: r5SStartSlop=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("16sstopslop") || a.equalsIgnoreCase("ssustopslop")){ jpayne@68: ssuStopSlop=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("23sstopslop") || a.equalsIgnoreCase("lsustopslop")){ jpayne@68: lsuStopSlop=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("5sstopslop")){ jpayne@68: r5SStopSlop=Integer.parseInt(b); jpayne@68: }else if(a.equals("plus")){ jpayne@68: PROCESS_PLUS_STRAND=Parse.parseBoolean(b); jpayne@68: }else if(a.equals("minus")){ jpayne@68: PROCESS_MINUS_STRAND=Parse.parseBoolean(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("min16SIdentity") || a.equalsIgnoreCase("min16SId")) { jpayne@68: min16SIdentity=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("min18SIdentity") || a.equalsIgnoreCase("min18SId")) { jpayne@68: min18SIdentity=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("min23SIdentity") || a.equalsIgnoreCase("min23SId")) { jpayne@68: min23SIdentity=Float.parseFloat(b); jpayne@68: }else if(a.equalsIgnoreCase("min5SIdentity") || a.equalsIgnoreCase("min5SId")) { jpayne@68: min5SIdentity=Float.parseFloat(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("align16s") || a.equalsIgnoreCase("load16SSequence")){ jpayne@68: load16SSequence=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("align23s") || a.equalsIgnoreCase("load23SSequence")){ jpayne@68: load23SSequence=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("align18s") || a.equalsIgnoreCase("load18SSequence")){ jpayne@68: load18SSequence=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("align5s") || a.equalsIgnoreCase("load5SSequence")){ jpayne@68: load5SSequence=Parse.parseBoolean(b); jpayne@68: } jpayne@68: jpayne@68: else if(a.equalsIgnoreCase("load16skmers") || a.equalsIgnoreCase("load18skmers") || a.equalsIgnoreCase("loadssukmers")){ jpayne@68: loadSSUkmers=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("load23skmers") || a.equalsIgnoreCase("load28skmers") || a.equalsIgnoreCase("loadlsukmers")){ jpayne@68: loadLSUkmers=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("load5skmers")){ jpayne@68: load5Skmers=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("loadtrnakmers")){ jpayne@68: loadtRNAkmers=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("klongtrna")){ jpayne@68: kLongTRna=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("longkmers")){ jpayne@68: loadSSUkmers=loadLSUkmers=load5Skmers=loadtRNAkmers=Parse.parseBoolean(b); jpayne@68: }else if(a.equalsIgnoreCase("klong5s")){ jpayne@68: kLong5S=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("klong16s") || a.equalsIgnoreCase("klong18s") || a.equalsIgnoreCase("klongssu")){ jpayne@68: kLongSSU=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("klong23s") || a.equalsIgnoreCase("klong28s") || a.equalsIgnoreCase("klonglsu")){ jpayne@68: kLongLSU=Integer.parseInt(b); jpayne@68: }else if(a.equalsIgnoreCase("klongtrna")){ jpayne@68: kLongTRna=Integer.parseInt(b); jpayne@68: } jpayne@68: jpayne@68: else{ jpayne@68: return false; jpayne@68: } jpayne@68: return true; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static boolean processType(int type){ jpayne@68: return (type==CDS ? callCDS : type==r16S ? call16S : type==r23S ? call23S : type==r18S ? call18S : type==r5S ? call5S : type==tRNA ? calltRNA : true); jpayne@68: } jpayne@68: jpayne@68: public static int startSlop(int type) { jpayne@68: int slop=(type==r16S ? ssuStartSlop : type==r23S ? lsuStartSlop : type==r18S ? ssuStartSlop : type==r5S ? r5SStartSlop : 9999); jpayne@68: return slop; jpayne@68: } jpayne@68: jpayne@68: public static int stopSlop(int type) { jpayne@68: int slop=(type==r16S ? ssuStopSlop : type==r23S ? lsuStopSlop : type==r18S ? ssuStopSlop : type==r5S ? r5SStopSlop : 9999); jpayne@68: return slop; jpayne@68: } jpayne@68: jpayne@68: public static float minID(int type) { jpayne@68: float minIdentity=(type==r16S ? min16SIdentity : type==r23S ? min23SIdentity : type==r18S ? min18SIdentity : type==r5S ? min5SIdentity : 0); jpayne@68: return minIdentity; jpayne@68: } jpayne@68: jpayne@68: public static Read[] consensusReads(int type) { jpayne@68: Read[] consensusReads=(type==r16S ? r16SSequence : type==r23S ? r23SSequence : type==r18S ? r18SSequence : type==r5S ? r5SSequence : null); jpayne@68: return consensusReads; jpayne@68: } jpayne@68: jpayne@68: public static LongHashSet kmerSet(int type) { jpayne@68: LongHashSet set=(type==tRNA ? trnaKmers : type==r16S ? ssuKmers : type==r23S ? lsuKmers : type==r5S ? r5SKmers : type==r18S ? ssuKmers : null); jpayne@68: return set; jpayne@68: } jpayne@68: jpayne@68: public static int kLongLen(int type) { jpayne@68: int kLongLen=(type==tRNA ? kLongTRna : type==r16S ? kLongSSU : type==r23S ? kLongLSU : type==r5S ? kLong5S : type==r18S ? kLongSSU : -1); jpayne@68: return kLongLen; jpayne@68: } jpayne@68: jpayne@68: public static int flagToType(int flag) { jpayne@68: return Integer.numberOfTrailingZeros(flag)+1; jpayne@68: } jpayne@68: jpayne@68: public static byte typeToFlag(int type) { jpayne@68: assert(type<=6); jpayne@68: return (byte)(1<<(type-1)); jpayne@68: } jpayne@68: jpayne@68: public static boolean callType(int type){//TODO: Turn these functions into array lookups jpayne@68: if(type==CDS){return callCDS;} jpayne@68: else if(type==tRNA){return calltRNA;} jpayne@68: else if(type==r16S){return call16S;} jpayne@68: else if(type==r23S){return call23S;} jpayne@68: else if(type==r5S){return call5S;} jpayne@68: else if(type==r18S){return call18S;} jpayne@68: assert(false) : type; jpayne@68: return false; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Long Kmers ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static synchronized void loadLongKmers(){ jpayne@68: // assert(ssuKmers==null); jpayne@68: // assert(false) : load5Skmers+", "+kLong5s; jpayne@68: if(loadedLongKmers){return;} jpayne@68: if(loadSSUkmers){ssuKmers=loadLongKmersByType(kLongSSU, "ssu");} jpayne@68: if(loadLSUkmers){lsuKmers=loadLongKmersByType(kLongLSU, "lsu");} jpayne@68: if(load5Skmers){r5SKmers=loadLongKmersByType(kLong5S, "5S");} jpayne@68: if(loadtRNAkmers){trnaKmers=loadLongKmersByType(kLongTRna, "tRNA");} jpayne@68: loadedLongKmers=true; jpayne@68: } jpayne@68: jpayne@68: // private static LongHashSet loadLongKmers(StatsContainer sc, int k, String prefix){ jpayne@68: // String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa"); jpayne@68: // if(!new File(fname).exists()){ jpayne@68: // fname=fname+".gz"; jpayne@68: // if(!new File(fname).exists()){ jpayne@68: // System.err.println("Can't find "+fname); jpayne@68: // return null; jpayne@68: // } jpayne@68: // } jpayne@68: // LongHashSet set=loadLongKmers(fname, k); jpayne@68: // sc.kmerSet=set; jpayne@68: // sc.kLongLen=k; jpayne@68: // return set; jpayne@68: // } jpayne@68: jpayne@68: private static LongHashSet loadLongKmersByType(int k, String prefix){ jpayne@68: String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa", true); jpayne@68: if(!new File(fname).exists()){ jpayne@68: fname=fname+".gz"; jpayne@68: if(!new File(fname).exists()){ jpayne@68: System.err.println("Can't find "+fname); jpayne@68: return null; jpayne@68: } jpayne@68: } jpayne@68: LongHashSet set=loadLongKmers(fname, k); jpayne@68: return set; jpayne@68: } jpayne@68: jpayne@68: private static LongHashSet loadLongKmers(String fname, int k){//TODO: Consider making this a LongHashSet. No reason not to... jpayne@68: FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false); jpayne@68: ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1, false, ff, null); jpayne@68: cris.start(); //Start the stream jpayne@68: // if(verbose){outstream.println("Started cris");} jpayne@68: jpayne@68: LongHashSet set=new LongHashSet(1000); jpayne@68: ListNum ln=cris.nextList(); jpayne@68: while(ln!=null && ln.size()>0){ jpayne@68: processList(ln, set, k); jpayne@68: cris.returnList(ln); jpayne@68: ln=cris.nextList(); jpayne@68: } jpayne@68: if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} jpayne@68: ReadWrite.closeStream(cris); jpayne@68: return set; jpayne@68: } jpayne@68: jpayne@68: private static LongHashSet processList(ListNum ln, LongHashSet set, int k){ jpayne@68: final long mask=~((-1L)<<(2*k)); jpayne@68: for(Read r : ln){ jpayne@68: final byte[] bases=r.bases; jpayne@68: long kmer=0; jpayne@68: int len=0; jpayne@68: for(byte b : bases){ jpayne@68: final int num=AminoAcid.baseToNumber[b]; jpayne@68: if(num>=0){ jpayne@68: len++; jpayne@68: kmer=((kmer<<2)|num)&mask; jpayne@68: if(len>=k){ jpayne@68: set.add(kmer); jpayne@68: } jpayne@68: }else{ jpayne@68: len=0; jpayne@68: } jpayne@68: } jpayne@68: } jpayne@68: return set; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: /*---------------- Consensus Sequence ----------------*/ jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static synchronized void loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro){ jpayne@68: if(loadedConsensusSequence){return;} jpayne@68: // assert(r16SSequence==null); jpayne@68: if(load16SSequence){r16SSequence=loadConsensusSequenceType("16S", removeMito, removeChloro);} jpayne@68: if(load18SSequence){r18SSequence=loadConsensusSequenceType("18S", removeMito, removeChloro);} jpayne@68: if(load23SSequence){r23SSequence=loadConsensusSequenceType("23S", removeMito, removeChloro);} jpayne@68: if(load5SSequence){r5SSequence=loadConsensusSequenceType("5S", removeMito, removeChloro);} jpayne@68: if(loadtRNASequence){trnaSequence=loadConsensusSequenceType("tRNA", removeMito, removeChloro);} jpayne@68: loadedConsensusSequence=true; jpayne@68: } jpayne@68: jpayne@68: public static Read[] loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro){ jpayne@68: String fname=null; jpayne@68: fname=Data.findPath("?"+prefix+"_consensus_sequence.fq", false); jpayne@68: if(fname!=null && (fname.endsWith(".jar") || new File(fname).exists())){ jpayne@68: fname=Tools.fixExtension(fname); jpayne@68: }else{ jpayne@68: fname=Data.findPath("?"+prefix+"_consensus_sequence.fa", true); jpayne@68: fname=Tools.fixExtension(fname); jpayne@68: if(!fname.endsWith(".jar") && !new File(fname).exists()){ jpayne@68: System.err.println("Can't find "+fname); jpayne@68: return null; jpayne@68: } jpayne@68: } jpayne@68: Read[] array=loadConsensusSequence(fname); jpayne@68: if(removeMito){array=stripOrganelle(array, "mito");} jpayne@68: if(removeChloro){array=stripOrganelle(array, "plastid");} jpayne@68: return array; jpayne@68: } jpayne@68: jpayne@68: private static Read[] loadConsensusSequence(String fname){ jpayne@68: FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false); jpayne@68: Read[] array=ReadInputStream.toReadArray(ff, -1); jpayne@68: return array; jpayne@68: } jpayne@68: jpayne@68: private static Read[] stripOrganelle(Read[] array, String key){ jpayne@68: int removed=0; jpayne@68: for(int j=0; j0){array=Tools.condenseStrict(array);} jpayne@68: return array; jpayne@68: } jpayne@68: jpayne@68: /*--------------------------------------------------------------*/ jpayne@68: jpayne@68: public static final int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; jpayne@68: public static String[] typeStrings=new String[] {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; jpayne@68: public static String[] typeStrings2=new String[] {"CDS", "tRNA", "rRNA", "rRNA", "rRNA", "rRNA", "rRNA", "RNA"}; jpayne@68: public static String[] specialTypeStrings=new String[] {null, "tRNA", "16S", "23S", "5S", "18S", "28S", null}; jpayne@68: public static boolean isSpecialType(String type){ jpayne@68: if(type==null){return false;} jpayne@68: for(String s : specialTypeStrings){ jpayne@68: if(type.equalsIgnoreCase(s)){return true;} jpayne@68: } jpayne@68: return false; jpayne@68: } jpayne@68: jpayne@68: public static int kInnerRNA=6; jpayne@68: public static int kStartRNA=3; jpayne@68: public static int kStopRNA=3; jpayne@68: jpayne@68: public static int kLongSSU=15; jpayne@68: public static int kLongLSU=15; jpayne@68: public static int kLong5S=15; jpayne@68: public static int kLongTRna=15; jpayne@68: jpayne@68: public static float min16SIdentity=0.62f; jpayne@68: public static float min23SIdentity=0.60f; jpayne@68: public static float min5SIdentity=0.60f; jpayne@68: public static float min18SIdentity=0.60f; jpayne@68: jpayne@68: static int ssuStartSlop=200; jpayne@68: static int ssuStopSlop=0; jpayne@68: static int lsuStartSlop=220; jpayne@68: static int lsuStopSlop=0; jpayne@68: static int r5SStartSlop=50; jpayne@68: static int r5SStopSlop=50; jpayne@68: jpayne@68: public static boolean callCDS=true; jpayne@68: public static boolean calltRNA=true; jpayne@68: public static boolean call16S=true; jpayne@68: public static boolean call23S=true; jpayne@68: public static boolean call5S=true; jpayne@68: public static boolean call18S=false; jpayne@68: jpayne@68: public static LongHashSet ssuKmers=null; jpayne@68: public static LongHashSet lsuKmers=null; jpayne@68: public static LongHashSet r5SKmers=null; jpayne@68: public static LongHashSet trnaKmers=null; jpayne@68: jpayne@68: public static Read[] trnaSequence=null; jpayne@68: public static Read[] r16SSequence=null; jpayne@68: public static Read[] r23SSequence=null; jpayne@68: public static Read[] r5SSequence=null; jpayne@68: public static Read[] r18SSequence=null; jpayne@68: jpayne@68: public static boolean PROCESS_PLUS_STRAND=true; jpayne@68: public static boolean PROCESS_MINUS_STRAND=true; jpayne@68: jpayne@68: public static boolean loadSSUkmers=true; jpayne@68: public static boolean loadLSUkmers=true; jpayne@68: public static boolean load5Skmers=true; jpayne@68: public static boolean loadtRNAkmers=true; jpayne@68: private static boolean loadedLongKmers=false; jpayne@68: jpayne@68: public static boolean loadtRNASequence=false; jpayne@68: public static boolean load16SSequence=true; jpayne@68: public static boolean load23SSequence=true; jpayne@68: public static boolean load5SSequence=true; jpayne@68: public static boolean load18SSequence=true; jpayne@68: private static boolean loadedConsensusSequence=false; jpayne@68: jpayne@68: }