annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/ProkObject.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package prok;
jpayne@68 2
jpayne@68 3 import java.io.File;
jpayne@68 4
jpayne@68 5 import dna.AminoAcid;
jpayne@68 6 import dna.Data;
jpayne@68 7 import fileIO.FileFormat;
jpayne@68 8 import fileIO.ReadWrite;
jpayne@68 9 import shared.Parse;
jpayne@68 10 import shared.Tools;
jpayne@68 11 import stream.ConcurrentReadInputStream;
jpayne@68 12 import stream.Read;
jpayne@68 13 import stream.ReadInputStream;
jpayne@68 14 import structures.ListNum;
jpayne@68 15 import structures.LongHashSet;
jpayne@68 16
jpayne@68 17 /** Contains a lot of statics and static methods for gene-calling */
jpayne@68 18 public abstract class ProkObject {
jpayne@68 19
jpayne@68 20 public static boolean parse(String arg, String a, String b){
jpayne@68 21 if(a.equalsIgnoreCase("16sstartslop") || a.equalsIgnoreCase("ssustartslop")){
jpayne@68 22 ssuStartSlop=Integer.parseInt(b);
jpayne@68 23 }else if(a.equalsIgnoreCase("23sstartslop") || a.equalsIgnoreCase("lsustartslop")){
jpayne@68 24 lsuStartSlop=Integer.parseInt(b);
jpayne@68 25 }else if(a.equalsIgnoreCase("5sstartslop")){
jpayne@68 26 r5SStartSlop=Integer.parseInt(b);
jpayne@68 27 }else if(a.equalsIgnoreCase("16sstopslop") || a.equalsIgnoreCase("ssustopslop")){
jpayne@68 28 ssuStopSlop=Integer.parseInt(b);
jpayne@68 29 }else if(a.equalsIgnoreCase("23sstopslop") || a.equalsIgnoreCase("lsustopslop")){
jpayne@68 30 lsuStopSlop=Integer.parseInt(b);
jpayne@68 31 }else if(a.equalsIgnoreCase("5sstopslop")){
jpayne@68 32 r5SStopSlop=Integer.parseInt(b);
jpayne@68 33 }else if(a.equals("plus")){
jpayne@68 34 PROCESS_PLUS_STRAND=Parse.parseBoolean(b);
jpayne@68 35 }else if(a.equals("minus")){
jpayne@68 36 PROCESS_MINUS_STRAND=Parse.parseBoolean(b);
jpayne@68 37 }
jpayne@68 38
jpayne@68 39 else if(a.equalsIgnoreCase("min16SIdentity") || a.equalsIgnoreCase("min16SId")) {
jpayne@68 40 min16SIdentity=Float.parseFloat(b);
jpayne@68 41 }else if(a.equalsIgnoreCase("min18SIdentity") || a.equalsIgnoreCase("min18SId")) {
jpayne@68 42 min18SIdentity=Float.parseFloat(b);
jpayne@68 43 }else if(a.equalsIgnoreCase("min23SIdentity") || a.equalsIgnoreCase("min23SId")) {
jpayne@68 44 min23SIdentity=Float.parseFloat(b);
jpayne@68 45 }else if(a.equalsIgnoreCase("min5SIdentity") || a.equalsIgnoreCase("min5SId")) {
jpayne@68 46 min5SIdentity=Float.parseFloat(b);
jpayne@68 47 }
jpayne@68 48
jpayne@68 49 else if(a.equalsIgnoreCase("align16s") || a.equalsIgnoreCase("load16SSequence")){
jpayne@68 50 load16SSequence=Parse.parseBoolean(b);
jpayne@68 51 }else if(a.equalsIgnoreCase("align23s") || a.equalsIgnoreCase("load23SSequence")){
jpayne@68 52 load23SSequence=Parse.parseBoolean(b);
jpayne@68 53 }else if(a.equalsIgnoreCase("align18s") || a.equalsIgnoreCase("load18SSequence")){
jpayne@68 54 load18SSequence=Parse.parseBoolean(b);
jpayne@68 55 }else if(a.equalsIgnoreCase("align5s") || a.equalsIgnoreCase("load5SSequence")){
jpayne@68 56 load5SSequence=Parse.parseBoolean(b);
jpayne@68 57 }
jpayne@68 58
jpayne@68 59 else if(a.equalsIgnoreCase("load16skmers") || a.equalsIgnoreCase("load18skmers") || a.equalsIgnoreCase("loadssukmers")){
jpayne@68 60 loadSSUkmers=Parse.parseBoolean(b);
jpayne@68 61 }else if(a.equalsIgnoreCase("load23skmers") || a.equalsIgnoreCase("load28skmers") || a.equalsIgnoreCase("loadlsukmers")){
jpayne@68 62 loadLSUkmers=Parse.parseBoolean(b);
jpayne@68 63 }else if(a.equalsIgnoreCase("load5skmers")){
jpayne@68 64 load5Skmers=Parse.parseBoolean(b);
jpayne@68 65 }else if(a.equalsIgnoreCase("loadtrnakmers")){
jpayne@68 66 loadtRNAkmers=Parse.parseBoolean(b);
jpayne@68 67 }else if(a.equalsIgnoreCase("klongtrna")){
jpayne@68 68 kLongTRna=Integer.parseInt(b);
jpayne@68 69 }else if(a.equalsIgnoreCase("longkmers")){
jpayne@68 70 loadSSUkmers=loadLSUkmers=load5Skmers=loadtRNAkmers=Parse.parseBoolean(b);
jpayne@68 71 }else if(a.equalsIgnoreCase("klong5s")){
jpayne@68 72 kLong5S=Integer.parseInt(b);
jpayne@68 73 }else if(a.equalsIgnoreCase("klong16s") || a.equalsIgnoreCase("klong18s") || a.equalsIgnoreCase("klongssu")){
jpayne@68 74 kLongSSU=Integer.parseInt(b);
jpayne@68 75 }else if(a.equalsIgnoreCase("klong23s") || a.equalsIgnoreCase("klong28s") || a.equalsIgnoreCase("klonglsu")){
jpayne@68 76 kLongLSU=Integer.parseInt(b);
jpayne@68 77 }else if(a.equalsIgnoreCase("klongtrna")){
jpayne@68 78 kLongTRna=Integer.parseInt(b);
jpayne@68 79 }
jpayne@68 80
jpayne@68 81 else{
jpayne@68 82 return false;
jpayne@68 83 }
jpayne@68 84 return true;
jpayne@68 85 }
jpayne@68 86
jpayne@68 87 /*--------------------------------------------------------------*/
jpayne@68 88
jpayne@68 89 public static boolean processType(int type){
jpayne@68 90 return (type==CDS ? callCDS : type==r16S ? call16S : type==r23S ? call23S : type==r18S ? call18S : type==r5S ? call5S : type==tRNA ? calltRNA : true);
jpayne@68 91 }
jpayne@68 92
jpayne@68 93 public static int startSlop(int type) {
jpayne@68 94 int slop=(type==r16S ? ssuStartSlop : type==r23S ? lsuStartSlop : type==r18S ? ssuStartSlop : type==r5S ? r5SStartSlop : 9999);
jpayne@68 95 return slop;
jpayne@68 96 }
jpayne@68 97
jpayne@68 98 public static int stopSlop(int type) {
jpayne@68 99 int slop=(type==r16S ? ssuStopSlop : type==r23S ? lsuStopSlop : type==r18S ? ssuStopSlop : type==r5S ? r5SStopSlop : 9999);
jpayne@68 100 return slop;
jpayne@68 101 }
jpayne@68 102
jpayne@68 103 public static float minID(int type) {
jpayne@68 104 float minIdentity=(type==r16S ? min16SIdentity : type==r23S ? min23SIdentity : type==r18S ? min18SIdentity : type==r5S ? min5SIdentity : 0);
jpayne@68 105 return minIdentity;
jpayne@68 106 }
jpayne@68 107
jpayne@68 108 public static Read[] consensusReads(int type) {
jpayne@68 109 Read[] consensusReads=(type==r16S ? r16SSequence : type==r23S ? r23SSequence : type==r18S ? r18SSequence : type==r5S ? r5SSequence : null);
jpayne@68 110 return consensusReads;
jpayne@68 111 }
jpayne@68 112
jpayne@68 113 public static LongHashSet kmerSet(int type) {
jpayne@68 114 LongHashSet set=(type==tRNA ? trnaKmers : type==r16S ? ssuKmers : type==r23S ? lsuKmers : type==r5S ? r5SKmers : type==r18S ? ssuKmers : null);
jpayne@68 115 return set;
jpayne@68 116 }
jpayne@68 117
jpayne@68 118 public static int kLongLen(int type) {
jpayne@68 119 int kLongLen=(type==tRNA ? kLongTRna : type==r16S ? kLongSSU : type==r23S ? kLongLSU : type==r5S ? kLong5S : type==r18S ? kLongSSU : -1);
jpayne@68 120 return kLongLen;
jpayne@68 121 }
jpayne@68 122
jpayne@68 123 public static int flagToType(int flag) {
jpayne@68 124 return Integer.numberOfTrailingZeros(flag)+1;
jpayne@68 125 }
jpayne@68 126
jpayne@68 127 public static byte typeToFlag(int type) {
jpayne@68 128 assert(type<=6);
jpayne@68 129 return (byte)(1<<(type-1));
jpayne@68 130 }
jpayne@68 131
jpayne@68 132 public static boolean callType(int type){//TODO: Turn these functions into array lookups
jpayne@68 133 if(type==CDS){return callCDS;}
jpayne@68 134 else if(type==tRNA){return calltRNA;}
jpayne@68 135 else if(type==r16S){return call16S;}
jpayne@68 136 else if(type==r23S){return call23S;}
jpayne@68 137 else if(type==r5S){return call5S;}
jpayne@68 138 else if(type==r18S){return call18S;}
jpayne@68 139 assert(false) : type;
jpayne@68 140 return false;
jpayne@68 141 }
jpayne@68 142
jpayne@68 143 /*--------------------------------------------------------------*/
jpayne@68 144 /*---------------- Long Kmers ----------------*/
jpayne@68 145 /*--------------------------------------------------------------*/
jpayne@68 146
jpayne@68 147 public static synchronized void loadLongKmers(){
jpayne@68 148 // assert(ssuKmers==null);
jpayne@68 149 // assert(false) : load5Skmers+", "+kLong5s;
jpayne@68 150 if(loadedLongKmers){return;}
jpayne@68 151 if(loadSSUkmers){ssuKmers=loadLongKmersByType(kLongSSU, "ssu");}
jpayne@68 152 if(loadLSUkmers){lsuKmers=loadLongKmersByType(kLongLSU, "lsu");}
jpayne@68 153 if(load5Skmers){r5SKmers=loadLongKmersByType(kLong5S, "5S");}
jpayne@68 154 if(loadtRNAkmers){trnaKmers=loadLongKmersByType(kLongTRna, "tRNA");}
jpayne@68 155 loadedLongKmers=true;
jpayne@68 156 }
jpayne@68 157
jpayne@68 158 // private static LongHashSet loadLongKmers(StatsContainer sc, int k, String prefix){
jpayne@68 159 // String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa");
jpayne@68 160 // if(!new File(fname).exists()){
jpayne@68 161 // fname=fname+".gz";
jpayne@68 162 // if(!new File(fname).exists()){
jpayne@68 163 // System.err.println("Can't find "+fname);
jpayne@68 164 // return null;
jpayne@68 165 // }
jpayne@68 166 // }
jpayne@68 167 // LongHashSet set=loadLongKmers(fname, k);
jpayne@68 168 // sc.kmerSet=set;
jpayne@68 169 // sc.kLongLen=k;
jpayne@68 170 // return set;
jpayne@68 171 // }
jpayne@68 172
jpayne@68 173 private static LongHashSet loadLongKmersByType(int k, String prefix){
jpayne@68 174 String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa", true);
jpayne@68 175 if(!new File(fname).exists()){
jpayne@68 176 fname=fname+".gz";
jpayne@68 177 if(!new File(fname).exists()){
jpayne@68 178 System.err.println("Can't find "+fname);
jpayne@68 179 return null;
jpayne@68 180 }
jpayne@68 181 }
jpayne@68 182 LongHashSet set=loadLongKmers(fname, k);
jpayne@68 183 return set;
jpayne@68 184 }
jpayne@68 185
jpayne@68 186 private static LongHashSet loadLongKmers(String fname, int k){//TODO: Consider making this a LongHashSet. No reason not to...
jpayne@68 187 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false);
jpayne@68 188 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1, false, ff, null);
jpayne@68 189 cris.start(); //Start the stream
jpayne@68 190 // if(verbose){outstream.println("Started cris");}
jpayne@68 191
jpayne@68 192 LongHashSet set=new LongHashSet(1000);
jpayne@68 193 ListNum<Read> ln=cris.nextList();
jpayne@68 194 while(ln!=null && ln.size()>0){
jpayne@68 195 processList(ln, set, k);
jpayne@68 196 cris.returnList(ln);
jpayne@68 197 ln=cris.nextList();
jpayne@68 198 }
jpayne@68 199 if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
jpayne@68 200 ReadWrite.closeStream(cris);
jpayne@68 201 return set;
jpayne@68 202 }
jpayne@68 203
jpayne@68 204 private static LongHashSet processList(ListNum<Read> ln, LongHashSet set, int k){
jpayne@68 205 final long mask=~((-1L)<<(2*k));
jpayne@68 206 for(Read r : ln){
jpayne@68 207 final byte[] bases=r.bases;
jpayne@68 208 long kmer=0;
jpayne@68 209 int len=0;
jpayne@68 210 for(byte b : bases){
jpayne@68 211 final int num=AminoAcid.baseToNumber[b];
jpayne@68 212 if(num>=0){
jpayne@68 213 len++;
jpayne@68 214 kmer=((kmer<<2)|num)&mask;
jpayne@68 215 if(len>=k){
jpayne@68 216 set.add(kmer);
jpayne@68 217 }
jpayne@68 218 }else{
jpayne@68 219 len=0;
jpayne@68 220 }
jpayne@68 221 }
jpayne@68 222 }
jpayne@68 223 return set;
jpayne@68 224 }
jpayne@68 225
jpayne@68 226 /*--------------------------------------------------------------*/
jpayne@68 227 /*---------------- Consensus Sequence ----------------*/
jpayne@68 228 /*--------------------------------------------------------------*/
jpayne@68 229
jpayne@68 230 public static synchronized void loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro){
jpayne@68 231 if(loadedConsensusSequence){return;}
jpayne@68 232 // assert(r16SSequence==null);
jpayne@68 233 if(load16SSequence){r16SSequence=loadConsensusSequenceType("16S", removeMito, removeChloro);}
jpayne@68 234 if(load18SSequence){r18SSequence=loadConsensusSequenceType("18S", removeMito, removeChloro);}
jpayne@68 235 if(load23SSequence){r23SSequence=loadConsensusSequenceType("23S", removeMito, removeChloro);}
jpayne@68 236 if(load5SSequence){r5SSequence=loadConsensusSequenceType("5S", removeMito, removeChloro);}
jpayne@68 237 if(loadtRNASequence){trnaSequence=loadConsensusSequenceType("tRNA", removeMito, removeChloro);}
jpayne@68 238 loadedConsensusSequence=true;
jpayne@68 239 }
jpayne@68 240
jpayne@68 241 public static Read[] loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro){
jpayne@68 242 String fname=null;
jpayne@68 243 fname=Data.findPath("?"+prefix+"_consensus_sequence.fq", false);
jpayne@68 244 if(fname!=null && (fname.endsWith(".jar") || new File(fname).exists())){
jpayne@68 245 fname=Tools.fixExtension(fname);
jpayne@68 246 }else{
jpayne@68 247 fname=Data.findPath("?"+prefix+"_consensus_sequence.fa", true);
jpayne@68 248 fname=Tools.fixExtension(fname);
jpayne@68 249 if(!fname.endsWith(".jar") && !new File(fname).exists()){
jpayne@68 250 System.err.println("Can't find "+fname);
jpayne@68 251 return null;
jpayne@68 252 }
jpayne@68 253 }
jpayne@68 254 Read[] array=loadConsensusSequence(fname);
jpayne@68 255 if(removeMito){array=stripOrganelle(array, "mito");}
jpayne@68 256 if(removeChloro){array=stripOrganelle(array, "plastid");}
jpayne@68 257 return array;
jpayne@68 258 }
jpayne@68 259
jpayne@68 260 private static Read[] loadConsensusSequence(String fname){
jpayne@68 261 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false);
jpayne@68 262 Read[] array=ReadInputStream.toReadArray(ff, -1);
jpayne@68 263 return array;
jpayne@68 264 }
jpayne@68 265
jpayne@68 266 private static Read[] stripOrganelle(Read[] array, String key){
jpayne@68 267 int removed=0;
jpayne@68 268 for(int j=0; j<array.length; j++){
jpayne@68 269 if(array[j].id.toLowerCase().startsWith(key)) {
jpayne@68 270 array[j]=null;
jpayne@68 271 removed++;
jpayne@68 272 }
jpayne@68 273 }
jpayne@68 274 if(removed>0){array=Tools.condenseStrict(array);}
jpayne@68 275 return array;
jpayne@68 276 }
jpayne@68 277
jpayne@68 278 /*--------------------------------------------------------------*/
jpayne@68 279
jpayne@68 280 public static final int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
jpayne@68 281 public static String[] typeStrings=new String[] {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
jpayne@68 282 public static String[] typeStrings2=new String[] {"CDS", "tRNA", "rRNA", "rRNA", "rRNA", "rRNA", "rRNA", "RNA"};
jpayne@68 283 public static String[] specialTypeStrings=new String[] {null, "tRNA", "16S", "23S", "5S", "18S", "28S", null};
jpayne@68 284 public static boolean isSpecialType(String type){
jpayne@68 285 if(type==null){return false;}
jpayne@68 286 for(String s : specialTypeStrings){
jpayne@68 287 if(type.equalsIgnoreCase(s)){return true;}
jpayne@68 288 }
jpayne@68 289 return false;
jpayne@68 290 }
jpayne@68 291
jpayne@68 292 public static int kInnerRNA=6;
jpayne@68 293 public static int kStartRNA=3;
jpayne@68 294 public static int kStopRNA=3;
jpayne@68 295
jpayne@68 296 public static int kLongSSU=15;
jpayne@68 297 public static int kLongLSU=15;
jpayne@68 298 public static int kLong5S=15;
jpayne@68 299 public static int kLongTRna=15;
jpayne@68 300
jpayne@68 301 public static float min16SIdentity=0.62f;
jpayne@68 302 public static float min23SIdentity=0.60f;
jpayne@68 303 public static float min5SIdentity=0.60f;
jpayne@68 304 public static float min18SIdentity=0.60f;
jpayne@68 305
jpayne@68 306 static int ssuStartSlop=200;
jpayne@68 307 static int ssuStopSlop=0;
jpayne@68 308 static int lsuStartSlop=220;
jpayne@68 309 static int lsuStopSlop=0;
jpayne@68 310 static int r5SStartSlop=50;
jpayne@68 311 static int r5SStopSlop=50;
jpayne@68 312
jpayne@68 313 public static boolean callCDS=true;
jpayne@68 314 public static boolean calltRNA=true;
jpayne@68 315 public static boolean call16S=true;
jpayne@68 316 public static boolean call23S=true;
jpayne@68 317 public static boolean call5S=true;
jpayne@68 318 public static boolean call18S=false;
jpayne@68 319
jpayne@68 320 public static LongHashSet ssuKmers=null;
jpayne@68 321 public static LongHashSet lsuKmers=null;
jpayne@68 322 public static LongHashSet r5SKmers=null;
jpayne@68 323 public static LongHashSet trnaKmers=null;
jpayne@68 324
jpayne@68 325 public static Read[] trnaSequence=null;
jpayne@68 326 public static Read[] r16SSequence=null;
jpayne@68 327 public static Read[] r23SSequence=null;
jpayne@68 328 public static Read[] r5SSequence=null;
jpayne@68 329 public static Read[] r18SSequence=null;
jpayne@68 330
jpayne@68 331 public static boolean PROCESS_PLUS_STRAND=true;
jpayne@68 332 public static boolean PROCESS_MINUS_STRAND=true;
jpayne@68 333
jpayne@68 334 public static boolean loadSSUkmers=true;
jpayne@68 335 public static boolean loadLSUkmers=true;
jpayne@68 336 public static boolean load5Skmers=true;
jpayne@68 337 public static boolean loadtRNAkmers=true;
jpayne@68 338 private static boolean loadedLongKmers=false;
jpayne@68 339
jpayne@68 340 public static boolean loadtRNASequence=false;
jpayne@68 341 public static boolean load16SSequence=true;
jpayne@68 342 public static boolean load23SSequence=true;
jpayne@68 343 public static boolean load5SSequence=true;
jpayne@68 344 public static boolean load18SSequence=true;
jpayne@68 345 private static boolean loadedConsensusSequence=false;
jpayne@68 346
jpayne@68 347 }